%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.56",
%%%     date            = "17 March 2026",
%%%     time            = "15:19:58 MDT",
%%%     filename        = "trets.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "64414 24340 123117 1188626",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Reconfigurable Technology
%%%                        and Systems; bibliography; TRETS",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Reconfigurable Technology
%%%                        and Systems (CODEN ????, ISSN 1936-7406
%%%                        (print), 1936-7414 (electronic)), covering
%%%                        all journal issues from 2008 -- date.
%%%
%%%                        At version 1.56, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2008 (  17)    2015 (  44)    2022 (  52)
%%%                             2009 (  33)    2016 (  29)    2023 (  64)
%%%                             2010 (  37)    2017 (  20)    2024 (  62)
%%%                             2011 (  29)    2018 (  28)    2025 (  58)
%%%                             2012 (  22)    2019 (  20)    2026 (  13)
%%%                             2013 (  19)    2020 (  21)
%%%                             2014 (  27)    2021 (  20)
%%%
%%%                             Article:        615
%%%
%%%                             Total entries:  615
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/trets/
%%%                            http://portal.acm.org/toc.cfm?id=J1151
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available. Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters. This is produced by Robert
%%%                        Solovay's checksum utility."
%%%  }
%%% ====================================================================
@Preamble{
    "\input bibnames.sty" #
    "\ifx \undefined \circled \def \circled #1{(#1)} \fi" #
    "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi" #
    "\ifx \undefined \reg \def \reg {\circled{R}} \fi" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F.
Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TRETS = "ACM Transactions on Reconfigurable Technology and
                  Systems (TRETS)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Buell:2008:I,
  author =       "Duncan Buell and Wayne Luk",
  title =        "Introduction",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331898",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{DeHon:2008:GET,
  author =       "Andr{\'e} DeHon and Mike Hutton",
  title =        "Guest Editorial: {TRETS} Special Edition on the {15th
                 International Symposium on FPGAs}",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1341292",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Matsumoto:2008:SID,
  author =       "Yohei Matsumoto and Masakazu Hioki and Takashi
                 Kawanami and Hanpei Koike and Toshiyuki Tsutsumi and
                 Tadashi Nakagawa and Toshihiro Sekigawa",
  title =        "Suppression of Intrinsic Delay Variation in {FPGAs}
                 using Multiple Configurations",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331899",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A new method for improving the timing yield of
                 field-programmable gate array (FPGA) devices affected
                 by intrinsic within-die variation is proposed. The
                 timing variation is reduced by selecting an
                 appropriate configuration for each chip from a set of
                 independent configurations, the critical paths of
                 which do not share the same circuit resources on the
                 FPGA. In this article, the actual method used to
                 generate independent multiple configurations by simply
                 repeating the routing phase is shown, along with the
                 results of Monte Carlo simulation with 10,000 samples.
                 One simulation result showed that the standard
                 deviations of maximum critical path delays are reduced
                 by 28\% and 49\% for 10\% and 30\% V$_{th}$ variations
                 ($ \sigma / \mu $), respectively, with 10 independent
                 configurations. Therefore, the proposed method is
                 especially effective for larger V$_{th}$ variation and
                 is expected to be useful for suppressing the
                 performance variation of FPGAs due to the future
                 increase of parameter variation.
Another simulation result showed that the effectiveness of the
                 proposed technique was saturated at the use of 10 or
                 more configurations because of the degradation of the
                 quality of the configurations. Therefore, the use of
                 10 or fewer configurations is reasonable.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "configuration; FPGA; timing yield; within-die
                 variation",
}

@Article{Sivaswamy:2008:SAP,
  author =       "Satish Sivaswamy and Kia Bazargan",
  title =        "Statistical Analysis and Process Variation-Aware
                 Routing and Skew Assignment for {FPGAs}",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331900",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "With constant scaling of process technologies, chip
                 design is becoming increasingly difficult due to
                 process variations. The FPGA community has only
                 recently started focusing on the effects of
                 variations. In this work we present a statistical
                 analysis to compare the effects of variations on
                 designs mapped to FPGAs and ASICs. We also present CAD
                 and architecture techniques to mitigate the impact of
                 variations. First we present a variation-aware router
                 that optimizes statistical criticality. We then
                 propose a modification to the clock network to deliver
                 programmable skews to different flip-flops. Finally,
                 we combine the two techniques and the result is a 9x
                 reduction in yield loss that translates to a 12\%
                 improvement in timing yield.
When the desired timing yield is set to 99\%, our combined
                 statistical routing and skew assignment technique
                 results in a delay improvement of about 10\% over a
                 purely deterministic approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "routing; skew assignment; statistical timing
                 analysis",
}

@Article{Lu:2008:DCR,
  author =       "Shih-Lien L. Lu and Peter Yiannacouras and Taeweon
                 Suh and Rolf Kassa and Michael Konow",
  title =        "A Desktop Computer with a Reconfigurable
                 {Pentium\reg}",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331901",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Advancements in reconfigurable technologies,
                 specifically FPGAs, have yielded faster, more
                 power-efficient reconfigurable devices with enormous
                 capacities. In our work, we provide testament to the
                 impressive capacity of recent FPGAs by hosting a
                 complete Pentium$^{\reg }$ in a single FPGA chip. In
                 addition we demonstrate how FPGAs can be used for
                 microprocessor design space exploration while
                 overcoming the tension between simulation speed, model
                 accuracy, and model completeness found in traditional
                 software simulator environments.
Specifically, we perform preliminary experimentation/prototyping
                 with an original Socket 7 based desktop processor
                 system with typical hardware peripherals running
                 modern operating systems such as Fedora Core 4 and
                 Windows XP; however we have inserted a Xilinx Virtex-4
                 in place of the processor that should sit in the
                 motherboard and have used the Virtex-4 to host a
                 complete version of the Pentium$^{\reg }$
                 microprocessor (which consumes less than half its
                 resources). We can therefore apply architectural
                 changes to the processor and evaluate their effects on
                 the complete desktop system. We use this FPGA-based
                 emulation system to conduct preliminary architectural
                 experiments including growing the branch target buffer
                 and the level 1 caches. In addition, we experimented
                 with interfacing hardware accelerators such as DES and
                 AES engines which resulted in a 27x speedup.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "accelerator; architecture; emulator; exploration;
                 FPGA; model; operating system; Pentium processor;
                 reconfigurable; simulator",
}

@Article{Feng:2008:DEI,
  author =       "Wenyi Feng and Sinan Kaptanoglu",
  title =        "Designing Efficient Input Interconnect Blocks for
                 {LUT} Clusters Using Counting and Entropy",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331902",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In a cluster-based FPGA, the interconnect from
                 external routing tracks and cluster feedbacks to the
                 LUT inputs consumes significant area, and no consensus
                 has emerged among different implementations (e.g.,
1-level or 2-level). In this paper, we model this interconnect as a
                 unified input interconnect block (IIB). We identify
                 three types of IIBs and develop general combinatorial
                 techniques to count the number of distinct functional
                 configurations for them. We use entropy, defined as
                 the logarithm of this count, to estimate an IIB's
                 routing flexibility. This enables us to analytically
                 evaluate different IIBs without the customary
                 time-consuming place and route experiments. We show
                 that both depopulated 1-level IIBs and VPR-style
                 2-level IIBs achieve high routing flexibility but lack
                 area efficiency. We propose a novel class of highly
                 efficient, yet still simple, IIBs that use
                 substantially fewer switches with only a small
                 degradation in routing flexibility. Experimental
                 results verify the routability of these IIBs, and
                 confirm that entropy is a good predictor of
                 routability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "cluster; counting; entropy; FPGAs; interconnect; LUT;
                 PLDs",
}

@Article{Wilton:2008:SDO,
  author =       "Steven J. E. Wilton and Chun Hok Ho and Bradley
                 Quinton and Philip H. W. Leong and Wayne Luk",
  title =        "A Synthesizable Datapath-Oriented Embedded {FPGA}
                 Fabric for Silicon Debug Applications",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331903",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We present an architecture for a synthesizable
                 datapath-oriented FPGA core that can be used to
                 provide post-fabrication flexibility to an SoC.
Our architecture is optimized for bus-based operations and employs a
                 directional routing architecture, which allows it to
                 be synthesized using standard ASIC design tools and
                 flows. The primary motivation for this architecture is
                 to provide an efficient mechanism to support on-chip
                 debugging. The fabric can also be used to implement
                 other datapath-oriented circuits such as those needed
                 in signal processing and computation-intensive
                 applications. We evaluate our architecture using a set
                 of benchmark circuits and compare it to previous
                 fabrics in terms of area, speed, and power.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Field programmable gate array; integrated circuit;
                 silicon debug; system-on-chip",
}

@Article{Guneysu:2008:SPH,
  author =       "Tim G{\"u}neysu and Christof Paar and Jan Pelzl",
  title =        "Special-Purpose Hardware for Solving the Elliptic
                 Curve Discrete Logarithm Problem",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371580",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The resistance against powerful index-calculus attacks
                 makes Elliptic Curve Cryptosystems (ECC) an
                 interesting alternative to conventional asymmetric
                 cryptosystems, like RSA. Operands in ECC require
                 significantly less bits at the same level of security,
                 resulting in a higher computational efficiency
                 compared to RSA. With growing computational
                 capabilities and continuous technological improvements
                 over the years, however, the question of the security
                 of ECC against attacks based on special-purpose
                 hardware arises. In this context, recently emerged
                 low-cost FPGAs demand for attention in the domain of
                 hardware-based cryptanalysis: the extraordinary
                 efficiency of modern programmable hardware devices
                 allow for a low-budget implementation of
                 hardware-based ECC attacks---without the requirement
                 of the expensive development of ASICs.\par
                 With focus on the aspect of cost-efficiency, this
                 contribution presents and analyzes an FPGA-based
                 architecture of an attack against ECC over prime
                 fields. A multi-processing hardware architecture for
                 Pollard's Rho method is described. We provide results
                 on actually used key lengths of ECC (128 bits and
                 above) and estimate the expected runtime for a
                 successful attack.\par
                 As a first result, currently used elliptic curve
                 cryptosystems with a security of 160 bit and above
                 turn out to be infeasible to break with available
                 computational and financial resources. However, some
                 of the security standards proposed by the Standards
                 for Efficient Cryptography Group (SECG) become subject
                 to attacks based on low-cost FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "cryptanalysis; discrete logarithm; elliptic curve
                 cryptosystem; Pollard's rho",
}

@Article{Jacob:2008:MBA,
  author =       "Arpith Jacob and Joseph Lancaster and Jeremy Buhler
                 and Brandon Harris and Roger D.
Chamberlain",
  title =        "{Mercury BLASTP}: Accelerating Protein Sequence
                 Alignment",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371581",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Large-scale protein sequence comparison is an
                 important but compute-intensive task in molecular
                 biology. BLASTP is the most popular tool for
                 comparative analysis of protein sequences. In recent
                 years, an exponential increase in the size of protein
                 sequence databases has required either exponentially
                 more running time or a cluster of machines to keep
                 pace. To address this problem, we have designed and
                 built a high-performance FPGA-accelerated version of
                 BLASTP, {\em Mercury BLASTP}. In this article, we
                 describe the architecture of the portions of the
                 application that are accelerated in the FPGA, and we
                 also describe the integration of these
                 FPGA-accelerated portions with the existing BLASTP
                 software. We have implemented Mercury BLASTP on a
                 commodity workstation with two Xilinx Virtex-II 6000
                 FPGAs. We show that the new design runs 11--15 times
                 faster than software BLASTP on a modern CPU while
                 delivering close to 99\% identical results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "bioinformatics; biological sequence alignment",
}

@Article{Sedcole:2008:PYM,
  author =       "Pete Sedcole and Peter Y. K.
Cheung",
  title =        "Parametric Yield Modeling and Simulations of {FPGA}
                 Circuits Considering Within-Die Delay Variations",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371582",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Variations in the semiconductor fabrication process
                 results in differences in parameters between
                 transistors on the same die, a problem exacerbated by
                 lithographic scaling. Field-Programmable Gate Arrays
                 may be able to compensate for within-die delay
                 variability, by judicious use of reconfigurability.
                 This article presents two strategies for compensating
                 within-die stochastic delay variability by using
                 reconfiguration: reconfiguring the entire FPGA, and
                 relocating subcircuits within an FPGA. Analytical
                 models for the theoretical bounds on the achievable
                 gains are derived for both strategies and compared to
                 models for worst-case design as well as statistical
                 static timing analysis (SSTA). All models are
                 validated by comparison to circuit-level Monte Carlo
                 simulations. It is demonstrated that significant
                 improvements in circuit yield and timing are possible
                 using SSTA alone, and these improvements can be
                 enhanced by employing reconfiguration-based
                 techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "delay; FPGA; modeling; process variation;
                 reconfiguration; statistical theory; within-die
                 variability; yield",
}

@Article{Gorjiara:2008:MDC,
  author =       "Bita Gorjiara and Mehrdad Reshadi and Daniel Gajski",
  title =        "Merged Dictionary Code Compression for {FPGA}
                 Implementation of Custom Microcoded {PEs}",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371583",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Horizontal Microcoded Architecture (HMA) is a paradigm
                 for designing programmable high-performance processing
                 elements (PEs). However, it suffers from large code
                 size, which can be addressed by compression. In this
                 article, we study the code size of one of the new
                 HMA-based technologies called No-Instruction-Set
                 Computer (NISC). We show that NISC code size can be
                 several times larger than a typical RISC processor,
                 and we propose several low-overhead dictionary-based
                 code compression techniques to reduce its code size.
                 Our compression algorithm leverages the knowledge of
                 ``don't care'' values in the control words and can
                 reduce the code size by 3.3 times, on average. Despite
                 such good results, as shown in this article, these
                 compression techniques lead to poor FPGA
                 implementations because they require many on-chip
                 RAMs. To address this issue, we introduce an
                 FPGA-aware dictionary-based technique that uses the
                 dual-port feature of on-chip RAMs to reduce the number
                 of utilized block RAMs by half. Additionally, we
                 propose cascading two-levels of dictionaries for code
                 size and block RAM reduction of large programs. For an
                 MP3 application, a merged, cascaded, three-dictionary
                 implementation reduces the number of utilized block
                 RAMs by 4.3 times (76\%) compared to a NISC without
                 compression. This corresponds to 20\% additional
                 savings over the best single level dictionary-based
                 compression.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "dictionary based compression; FPGA; memory
                 optimization; microcoded architectures;
                 no-instruction-set computer",
}

@Article{Thomas:2008:MGR,
  author =       "David B. Thomas and Wayne Luk",
  title =        "Multivariate {Gaussian} Random Number Generation
                 Targeting Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371584",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The multivariate Gaussian distribution is often used
                 to model correlations between stochastic time-series,
                 and can be used to explore the effect of these
                 correlations across $N$ time-series in Monte-Carlo
                 simulations. However, generating random correlated
                 vectors is an $ O(N^2) $ process, and quickly becomes
                 a computational bottleneck in software simulations.
                 This article presents an efficient method for
                 generating vectors in parallel hardware, using $N$
                 parallel pipelined components to generate a new vector
                 every $N$ cycles. This method maps well to the
                 embedded block RAMs and multipliers in contemporary
                 FPGAs, particularly as extensive testing shows that
                 the limited bit-width arithmetic does not reduce the
                 statistical quality of the generated vectors.
An implementation of the architecture in the Virtex-4 architecture
                 achieves a 500MHz clock-rate, and can support vector
                 lengths up to 512 in the largest devices. The
                 combination of a high clock-rate and parallelism
                 provides a significant performance advantage over
                 conventional processors, with an xc4vsx55 device at
                 500MHz providing a 200 times speedup over an Opteron
                 2.6GHz using an AMD optimised BLAS package. In a case
                 study in Delta-Gamma Value-at-Risk, an RC2000
                 accelerator card using an xc4vsx55 at 400MHz is 26
                 times faster than a quad Opteron 2.6GHz SMP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; multivariate Gaussian distribution; random
                 numbers",
}

@Article{Lamoureux:2008:TBP,
  author =       "Julien Lamoureux and Steven J. E. Wilton",
  title =        "On the trade-off between power and flexibility of
                 {FPGA} clock networks",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391733",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA clock networks consume a significant amount of
                 power, since they toggle every clock cycle and must be
                 flexible enough to implement the clocks for a wide
                 range of different applications. The efficiency of
                 FPGA clock networks can be improved by reducing this
                 flexibility; however, reducing the flexibility
                 introduces stricter constraints during the clustering
                 and placement stages of the FPGA CAD flow. These
                 constraints can reduce the overall efficiency of the
                 final implementation.
This article examines the trade-off between the power consumption and
                 flexibility of FPGA clock networks.\par
                 Specifically, this article makes three contributions.
                 First, it presents a new parameterized clock-network
                 framework for describing and comparing FPGA clock
                 networks. Second, it describes new clock-aware
                 placement techniques that are needed to find a legal
                 placement satisfying the constraints imposed by the
                 clock network. Finally, it performs an empirical study
                 to examine the trade-off between the power consumption
                 of the clock network and the impact of the CAD
                 constraints for a number of different clock networks
                 with varying amounts of flexibility.\par
                 The results show that the techniques used to produce a
                 legal placement can have a significant influence on
                 power and the ability of the placer to find a legal
                 solution. On average, circuits placed using the most
                 effective techniques dissipate 5\% less overall energy
                 and are significantly more likely to be legal than
                 circuits placed using other techniques. Moreover, the
                 results show that the architecture of the clock
                 network is also important. On average, FPGAs with an
                 efficient clock network are up to 14.6\% more energy
                 efficient compared to other FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "clock distribution networks; clock-aware placement;
                 FPGA; low-power design",
}

@Article{Slogsnat:2008:OSH,
  author =       "David Slogsnat and Alexander Giese and Mondrian
                 N{\"u}ssle and Ulrich Br{\"u}ning",
  title =        "An open-source {HyperTransport} core",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391734",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents the design of a generic
                 HyperTransport (HT) core. HyperTransport is a
                 packet-based interconnect technology for low-latency,
                 high-bandwidth point-to-point connections. It is
                 specially optimized to achieve a very low latency. The
                 core has been verified in system using an FPGA. This
                 exhaustive verification and the generic design allow
                 the mapping to both ASICs and FPGAs. The
                 implementation described in this work supports a
                 16-bit link width, as used by Opteron processors. On a
                 Xilinx Virtex-4 FX60, the core supports a link
                 frequency of 400 MHz DDR and offers a maximum
                 bidirectional bandwidth of 3.2GB/s. The in-system
                 verification has been performed using a custom FPGA
                 board that has been plugged into a HyperTransport
                 extension connector (HTX) of a standard Opteron-based
                 motherboard. HTX slots in Opteron-based motherboards
                 allow very high-bandwidth, low-latency communication,
                 since the HTX device is directly connected to one of
                 the HyperTransport links of the processor. Performance
                 analysis shows a unidirectional payload bandwidth of
                 1.4GB/s and a read latency of 180 ns.
The HT core in combination with the HTX board is an ideal base for
                 prototyping systems and implementing FPGA
                 coprocessors. The HT core is available as open
                 source.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; HTX; HyperTransport; prototyping; RTL",
}

@Article{Beeckler:2008:PGR,
  author =       "John S. Beeckler and Warren J. Gross",
  title =        "Particle graphics on reconfigurable hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391735",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Particle graphics simulations are well suited for
                 modeling complex phenomena such as water, cloth,
                 explosions, fire, smoke, and clouds. They are normally
                 realized in software as part of an interactive
                 graphics application. The computational complexity of
                 particle graphics simulations restricts the number of
                 particles that can be updated in software at
                 interactive frame rates. This article presents the
                 design and implementation of a hardware particle
                 graphics engine for accelerating real-time particle
                 graphics simulations. We explore the design process,
                 implementation issues, and limitations of using
                 field-programmable gate arrays (FPGAs) for the
                 acceleration of particle graphics. The FPGA particle
                 engine processes million-particle systems at a rate
                 from 47 to 112 million particles per second, which
                 represents one to two orders of magnitude speedup over
                 a 2.8 GHz CPU. Using three FPGAs, a maximum sustained
                 performance of 112 million particles per second was
                 achieved.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans.
Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "FPGAs; particle systems; reconfigurable computing; special-purpose architectures", } @Article{Grant:2008:PMS, author = "David Grant and Guy Lemieux", title = "Perturb $+$ mutate: Semisynthetic circuit generation for incremental placement and routing", journal = j-TRETS, volume = "1", number = "3", pages = "16:1--16:??", month = sep, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1391732.1391736", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 4 17:12:44 MST 2008", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "CAD tool designers are always searching for more benchmark circuits to stress their software. In this article we present a heuristic method to generate benchmark circuits specially suited for incremental place-and-route tools. The method removes part of a real circuit and replaces it with an altered version of the same circuit to mimic an incremental design change. The alteration consists of two steps: {\em mutate\/} followed by {\em perturb}. The perturb step exactly preserves as many circuit characteristics as possible. While perturbing, reproduction of interconnect locality, a characteristic that is difficult to measure reliably or reproduce exactly, is controlled using a new technique, {\em ancestor depth control\/} (ADC). Perturbing with ADC produces circuits with postrouting properties that match the best techniques known to date. The mutate step produces targeted mutations resulting in controlled changes to specific circuit properties (while keeping other properties constant). We demonstrate one targeted mutation heuristic, scale, to significantly change circuit size with little change to other circuit characteristics.
The method is simple enough for inclusion in a CAD tool directly, and fast enough for use in on-the-fly benchmark generation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "automated development tools; design automation; graph algorithms; hardware-supporting software; place and route; testing", } @Article{Hsiung:2008:PSB, author = "Pao-Ann Hsiung and Chao-Sheng Lin and Chih-Feng Liao", title = "{Perfecto}: a {SystemC}-based design-space exploration framework for dynamically reconfigurable architectures", journal = j-TRETS, volume = "1", number = "3", pages = "17:1--17:??", month = sep, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1391732.1391737", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 4 17:12:44 MST 2008", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "To cope with increasing demands for higher computational power and greater system flexibility, dynamically and partially reconfigurable logic has started to play an important role in embedded systems and systems-on-chip (SoC). However, when using traditional design methods and tools, it is difficult to estimate or analyze the performance impact of including such reconfigurable logic devices into a system design. In this work, we present a system-level framework, called Perfecto, which is able to perform rapid exploration of different reconfigurable design alternatives and to detect system performance bottlenecks. This framework is based on the popular IEEE standard system-level design language SystemC, which is supported by most EDA and ESL tools. 
Given an architecture model and an application model, Perfecto uses SystemC {\em transaction-level models\/} (TLMs) to simulate the system design alternatives automatically. Different hardware-software copartitioning, coscheduling, and placement algorithms can be embedded into the framework for analysis; thus, Perfecto can also be used to design the algorithms to be used in an operating system for reconfigurable systems. Applications to a simple illustration example and a network security system have shown how Perfecto helps a designer make intelligent partition decisions, optimize system performance, and evaluate task placements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "design-space exploration; partitioning; performance evaluation; placement; reconfigurable systems; scheduling", } @Article{Chin:2009:SDM, author = "Scott Y. L. Chin and Steven J. E. Wilton", title = "Static and Dynamic Memory Footprint Reduction for {FPGA} Routing Algorithms", journal = j-TRETS, volume = "1", number = "4", pages = "18:1--18:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462587", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents techniques to reduce the static and dynamic memory requirements of routing algorithms that target field-programmable gate arrays. During routing, memory is required to store both architectural data and temporary routing data. The architectural data is static, and provides a representation of the physical routing resources and programmable connections on the device. 
We show that by taking advantage of the regularity in FPGAs, we can reduce the amount of information that must be explicitly represented, leading to significant memory savings. The temporary routing data is dynamic, and contains scoring parameters and traceback information for each routing resource in the FPGA. By studying the lifespan of the temporary routing data objects, we develop several memory management schemes to reduce this component. To make our proposals concrete, we applied them to the routing algorithm in VPR and empirically quantified the impact on runtime memory footprint, and place and route time.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "CAD; FPGA; memory; routing; scalability", } @Article{Xu:2009:FAR, author = "Ning-Yi Xu and Xiong-Fei Cai and Rui Gao and Lei Zhang and Feng-Hsiung Hsu", title = "{FPGA} Acceleration of {RankBoost} in {Web} Search Engines", journal = j-TRETS, volume = "1", number = "4", pages = "19:1--19:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462588", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Search relevance is a key measurement for the usefulness of search engines. Shift of search relevance among search engines can easily change a search company's market cap by tens of billions of dollars. With the ever-increasing scale of the Web, machine learning technologies have become important tools to improve search relevance ranking. RankBoost is a promising algorithm in this area, but it is not widely used due to its long training time. 
To reduce the computation time for RankBoost, we designed an FPGA-based accelerator system and its upgraded version. The accelerator, plugged into a commodity PC, increased the training speed on MSN search engine data up to 1800x compared to the original software implementation on a server. The proposed accelerator has been successfully used by researchers in the search relevance ranking.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "FPGA; hardware acceleration", } @Article{Patterson:2009:STP, author = "C. D. Patterson and S. W. Ellingson and B. S. Martin and K. Deshpande and J. H. Simonetti and M. Kavic and S. E. Cutchin", title = "Searching for Transient Pulses with the {ETA} Radio Telescope", journal = j-TRETS, volume = "1", number = "4", pages = "20:1--20:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462589", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Array-based, direct-sampling radio telescopes have computational and communication requirements unsuited to conventional computer and cluster architectures. Synchronization must be strictly maintained across a large number of parallel data streams, from A/D conversion, through operations such as beamforming, to dataset recording. FPGAs supporting multigigabit serial I/O are ideally suited to this application. We describe a recently-constructed radio telescope called ETA having all-sky observing capability for detecting low frequency pulses from transient events such as gamma ray bursts and primordial black hole explosions.
Signals from 24 dipole antennas are processed by a tiered arrangement of 28 commercial FPGA boards and 4 PCs with FPGA-based data acquisition cards, connected with custom I/O adapter boards supporting InfiniBand and LVDS physical links. ETA is designed for unattended operation, allowing configuration and recording to be controlled remotely.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "Direct sampling radio telescope array; FPGA cluster computing; RFI mitigation; signal dedispersion", } @Article{El-Araby:2009:EPR, author = "Esam El-Araby and Ivan Gonzalez and Tarek El-Ghazawi", title = "Exploiting Partial Runtime Reconfiguration for High-Performance Reconfigurable Computing", journal = j-TRETS, volume = "1", number = "4", pages = "21:1--21:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462590", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Runtime Reconfiguration (RTR) has been traditionally utilized as a means for exploiting the flexibility of High-Performance Reconfigurable Computers (HPRCs). However, the RTR feature comes with the cost of high configuration overhead which might negatively impact the overall performance. Currently, modern FPGAs have more advanced mechanisms for reducing the configuration overheads, particularly Partial Runtime Reconfiguration (PRTR). It has been perceived that PRTR on HPRC systems can be the trend for improving the performance. 
In this work, we will investigate the potential of PRTR on HPRC by formally analyzing the execution model and experimentally verifying our analytical findings by enabling PRTR for the first time, to the best of our knowledge, on one of the current HPRC systems, Cray XD1. Our approach is general and can be applied to any of the available HPRC systems. The paper will conclude with recommendations and conditions, based on our conceptual and experimental work, for the optimal utilization of PRTR as well as possible future usage in HPRC.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "dynamic partial reconfiguration; field programmable gate arrays (FPGA); High performance computing; reconfigurable computing", } @Article{Holland:2009:RRA, author = "Brian Holland and Karthik Nagarajan and Alan D. George", title = "{RAT}: {RC} Amenability Test for Rapid Performance Prediction", journal = j-TRETS, volume = "1", number = "4", pages = "22:1--22:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462591", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "While the promise of achieving speedup and additional benefits such as high performance per watt with FPGAs continues to expand, chief among the challenges with the emerging paradigm of reconfigurable computing is the complexity in application design and implementation. 
Before a lengthy development effort is undertaken to map a given application to hardware, it is important that a high-level parallel algorithm crafted for that application first be analyzed relative to the target platform, so as to ascertain the likelihood of success in terms of potential speedup. This article presents the RC Amenability Test, or RAT, a methodology and model developed for this purpose, supporting rapid exploration and prediction of strategic design tradeoffs during the formulation stage of application development.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "formulation methodology; FPGA; performance prediction; reconfigurable computing; strategic design methodology", } @Article{Murtaza:2009:CBB, author = "S. Murtaza and A. G. Hoekstra and P. M. A. Sloot", title = "Compute Bound and {I/O} Bound Cellular Automata Simulations on {FPGA} Logic", journal = j-TRETS, volume = "1", number = "4", pages = "23:1--23:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462592", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "FPGA-based computation engines have been used as Cellular Automata accelerators in the scientific community for some time now. With the recent availability of more advanced FPGA logic it becomes necessary to better understand the mapping of Cellular Automata to these systems. There are many trade-offs to consider when mapping a Cellular Automata algorithm from an abstract system to the physical implementation using FPGA logic. The trade-offs include both the available FPGA resources and the Cellular Automata algorithm's execution time. 
The most important aspect is to fully understand the behavior of the specified CA algorithm in terms of its execution times which are either compute bound or I/O bound. In this article, we present a methodology to categorize a specified CA algorithm as a compute bound or an I/O bound. We take the methodology further by presenting rigorous analysis for each of the two cases identifying the various parameters that control the mapping process and are defined both by the Cellular Automata algorithm and the given FPGA hardware specifications. This methodology helps to predict the performance of running Cellular Automata algorithms on specific FPGA hardware and to determine optimal values for the various parameters that control the mapping process. The model is validated for both compute and I/O bound two-dimensional Cellular Automata algorithms. We find that our model predictions are accurate within 7\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "cellular automata; FPGA-based hardware accelerator; High-performance computing; lattice Boltzmann simulations", } @Article{Bouganis:2009:SOF, author = "Christos-S. Bouganis and Sung-Boem Park and George A. Constantinides and Peter Y. K. Cheung", title = "Synthesis and Optimization of {$2$D} Filter Designs for Heterogeneous {FPGAs}", journal = j-TRETS, volume = "1", number = "4", pages = "24:1--24:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1462586.1462593", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:01 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Many image processing applications require fast convolution of an image with one or more 2D filters.
Field-Programmable Gate Arrays (FPGAs) are often used to achieve this goal due to their fine grain parallelism and reconfigurability. However, the heterogeneous nature of modern reconfigurable devices is not usually considered during design optimization. This article proposes an algorithm that explores the space of possible implementation architectures of 2D filters, targeting the minimization of the required area, by optimizing the usage of the different components in a heterogeneous device. This is achieved by exploring the heterogeneous nature of modern reconfigurable devices using a Singular Value Decomposition based algorithm, which provides an efficient mapping of filter's implementation requirements to the heterogeneous components of modern FPGAs. In the case of multiple 2D filters, the proposed algorithm also exploits any redundancy that exists within each filter and between different filters in the set, leading to designs with minimized area. Experiments with real filter sets from computer vision applications demonstrate an average of up to 38\% reduction in the required area.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "2D filter design; FPGA; reconfigurable logic; Singular Value Decomposition", } @Article{Schaumont:2009:GEI, author = "Patrick R. Schaumont and Alex K. 
Jones and Steve Trimberger", title = "{Guest Editors}' Introduction to Security in Reconfigurable Systems Design", journal = j-TRETS, volume = "2", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1502782", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This special issue on Security in Reconfigurable Systems Design reports on recent research results in the design and implementation of trustworthy reconfigurable systems. Five articles cover topics including power-efficient implementation of public-key cryptography, side-channel analysis of electromagnetic radiation, side-channel resistant design, design of robust unclonable functions on an FPGA, and Trojan detection in an FPGA bitstream.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "physically unclonable function; side-channel resistant design; Trojan; Trustworthy design", } @Article{Keller:2009:ECC, author = "Maurice Keller and Andrew Byrne and William P. Marnane", title = "Elliptic Curve Cryptography on {FPGA} for Low-Power Applications", journal = j-TRETS, volume = "2", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1502783", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Elliptic curve cryptography has generated a lot of research interest due to its ability to provide greater security per bit compared to public key systems such as RSA. 
The designer of an elliptic curve hardware accelerator is faced with many choices at design time, each of which can impact the performance of the accelerator in different ways. There are many examples in the literature of how these design choices can affect the area and/or speed of an elliptic curve hardware accelerator. The effect of design choices on power and energy consumption in elliptic curve hardware has been less well studied. This article studies the effect of design choices on the power and energy consumption of an FPGA-based reconfigurable elliptic curve hardware accelerator. A reconfigurable processor has been used for different system parameters and the power and energy consumption measured. The power and energy results are presented and compared.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "Cryptography; elliptic curves; FPGA; low-power", } @Article{McEvoy:2009:IWH, author = "Robert P. McEvoy and Colin C. Murphy and William P. Marnane and Michael Tunstall", title = "Isolated {WDDL}: a Hiding Countermeasure for Differential Power Analysis on {FPGAs}", journal = j-TRETS, volume = "2", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1502784", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Security protocols are frequently accelerated by implementing the underlying cryptographic functions in reconfigurable hardware. However, unprotected hardware implementations are susceptible to side-channel attacks, and Differential Power Analysis (DPA) has been shown to be especially powerful.
In this work, we evaluate and compare the effectiveness of common hiding countermeasures against DPA in FPGA-based designs, using the Whirlpool hash function as a case study. In particular, we develop a new design flow called Isolated WDDL (IWDDL). In contrast with previous works, IWDDL isolates the direct and complementary circuit paths, and also provides DPA resistance in the Hamming distance power model. The analysis is supported using actual implementation results.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "DPA; FPGA; secure logic; Side-channel attacks; Whirlpool", } @Article{Sauvage:2009:ERF, author = "Laurent Sauvage and Sylvain Guilley and Yves Mathieu", title = "Electromagnetic Radiations of {FPGAs}: High Spatial Resolution Cartography and Attack on a Cryptographic Module", journal = j-TRETS, volume = "2", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1502785", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Since the first announcement of a Side Channel Analysis (SCA) about ten years ago, considerable research has been devoted to studying these attacks on Application Specific Integrated Circuits (ASICs), such as smart cards or TPMs. 
In this article, we compare power-line attacks with ElectroMagnetic (EM) attacks, specifically targeting Field Programmable Gate Array devices (FPGAs), as they are becoming widely used for sensitive applications involving cryptography.\par We show experimentally that ElectroMagnetic Analysis (EMA) is always faster than the historical Differential Power Analysis (DPA) in retrieving keys of symmetric ciphers. In addition, these analyses prove to be very convenient to conduct, as they are totally non-invasive.\par Research reports indicate that EMA can be conducted globally, typically with macroscopic home-made coils circling the device under attack, with fair results. However, as accurate professional EM antennas are now becoming more accessible, it has become commonplace to carry out EM analyses locally.\par Cartography has been carried out by optical means on circuits realized with technology greater than 250 nanometers. Nonetheless, for deep submicron technologies, the feature size of devices that are spied upon is too small to be visible with photographic techniques. In addition, the presence of the 6+ metallization layers obviously prevents a direct observation of the layout. Therefore, EM imaging is emerging as a relevant means to discover the underlying device structure.\par In this article, we present the first images of deep-submicron FPGAs. The resolution is not as accurate as photographic pictures: we notably compare the layout of toy design examples placed at the four corners of the FPGAs with the EM images we collected. We observe that EM imaging has the advantage of revealing active regions, which can be useful in locating a particular processor (visible while active---invisible when inactive).\par In the context of EM attacks, we stress that the exact localization of the cryptographic target is not necessary: the coarse resolution we obtain is sufficient. 
We note that the EM imaging does not reveal the exact layout of the FPGA, but instead directly guides the attacker towards the areas which are leaking the most. We achieve attacks with an accurate sensor, both far from (namely on a SMC capacitor on the board) and close to (namely directly over the FPGA) the encryption co-processor. As compared to the previously published attacks, we report a successful attack on a DES module in fewer than 6,300 measurements, which is currently the best cracking performance against this encryption algorithm implemented in FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "cartography; DPA; EMA; FPGA; SCA; security", } @Article{Majzoobi:2009:TDI, author = "Mehrdad Majzoobi and Farinaz Koushanfar and Miodrag Potkonjak", title = "Techniques for Design and Implementation of Secure Reconfigurable {PUFs}", journal = j-TRETS, volume = "2", number = "1", pages = "5:1--5:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1502786", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Physically unclonable functions (PUFs) provide a basis for many security and digital rights management protocols. PUF-based security approaches have numerous comparative strengths with respect to traditional cryptography-based techniques, including resilience against physical and side channel attacks and suitability for lightweight protocols. 
However, classical delay-based PUF structures have a number of drawbacks including susceptibility to guessing, reverse engineering, and emulation attacks, as well as sensitivity to operational and environmental variations.\par To address these limitations, we have developed a new set of techniques for FPGA-based PUF design and implementation. We demonstrate how reconfigurability can be exploited to eliminate the stated PUF limitations. We also show how FPGA-based PUFs can be used for privacy protection. Furthermore, reconfigurability enables the introduction of new techniques for PUF testing. The effectiveness of all the proposed techniques is validated using extensive implementations, simulations, and statistical analysis.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "hardware security; physically unclonable functions; process variation; Reconfigurable systems", } @Article{Dutt:2009:TBD, author = "Shantanu Dutt and Li Li", title = "Trust-Based Design and Check of {FPGA} Circuits Using Two-Level Randomized {ECC} Structures", journal = j-TRETS, volume = "2", number = "1", pages = "6:1--6:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1502781.1508209", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 1 18:15:27 MDT 2009", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A novel trust-based design method for FPGA circuits that uses error-correcting code (ECC) structures for detecting design tampers (changes, deletion of existing logic, and addition of extradesign logic-like Trojans) is proposed in this article. We determine ECC-based CLB (configuration logic block) parity groups and embed the check CLBs for each parity group in the FPGA circuit. 
During a trust-checking phase, a Test-Pattern Generator (TPG) and an Output Response Analyzer (ORA), configured in the FPGA, are used to check that each parity group of CLB outputs produce the expected parities. We use two levels of randomization to thwart attempts by an adversary to discover the parity groups and inject tampers that mask each other, or to tamper with the TPG and ORA so that design tampers remain undetected: (a) randomization of the mapping of the ECC parity groups to the CLB array; (b) randomization within each parity group of odd and even parities for different input combinations (classically, all ECC parity groups have even parities across all inputs). These randomizations along with the error-detecting property of the underlying ECC lead to design tampers being uncovered with very high probabilities, as we show both analytically and empirically. We also classify different CLB function structures and impose a parity group selection in which only similarly structured functions are randomly selected to be in the same parity group in order to minimize check function complexity. Using the 2D code as our underlying ECC and its 2-level randomization, our experiments with inserting 1--10 circuit CLB tampers and 1--5 extraneous logic CLBs in two medium-size circuits and a RISC processor circuit implemented on a Xilinx Spartan-3 FPGA show promising results of 100\% tamper detection and 0\% false alarms, obtained at a hardware overhead of only 7--10\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "Error-correcting codes; FPGAs; masking probability; parity groups; parity randomization; trust checking; trust-based design", } @Article{Amano:2009:GEI, author = "Hideharu Amano and Tadao Nakamura", title = "Guest editors' introduction: {ICFPT 2007}", journal = j-TRETS, volume = "2", number = "2", pages = "7:1--7:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534917", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhao:2009:TMB, author = "Weisheng Zhao and Eric Belhaire and Claude Chappert and Bernard Dieny and Guillaume Prenat", title = "{TAS-MRAM}-Based Low-Power High-Speed Runtime Reconfiguration {(RTR) FPGA}", journal = j-TRETS, volume = "2", number = "2", pages = "8:1--8:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534918", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Koch:2009:HDT, author = "Dirk Koch and Christian Beckhoff and J{\"u}rgen Teich", title = "Hardware Decompression Techniques for {FPGA}-Based Embedded Systems", journal = j-TRETS, volume = "2", number = "2", pages = "9:1--9:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534919", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wong:2009:SMC, author = "Justin S. J. Wong and Pete Sedcole and Peter Y. K. Cheung", title = "Self-Measurement of Combinatorial Circuit Delays in {FPGAs}", journal = j-TRETS, volume = "2", number = "2", pages = "10:1--10:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534920", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Seetharaman:2009:ASF, author = "G. Seetharaman and B. 
Venkataramani", title = "Automation Schemes for {FPGA} Implementation of Wave-Pipelined Circuits", journal = j-TRETS, volume = "2", number = "2", pages = "11:1--11:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534921", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yu:2009:VPS, author = "Jason Yu and Christopher Eagleston and Christopher Han-Yu Chou and Maxime Perreault and Guy Lemieux", title = "Vector Processing as a Soft Processor Accelerator", journal = j-TRETS, volume = "2", number = "2", pages = "12:1--12:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534922", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cevrero:2009:FPC, author = "Alessandro Cevrero and Panagiotis Athanasopoulos and Hadi Parandeh-Afshar and Ajay K. Verma and Hosein Seyed Attarzadeh Niaki and Chrysostomos Nicopoulos and Frank K. 
Gurkaynak and Philip Brisk and Yusuf Leblebici and Paolo Ienne", title = "Field Programmable Compressor Trees: Acceleration of Multi-Input Addition on {FPGAs}", journal = j-TRETS, volume = "2", number = "2", pages = "13:1--13:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534923", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jang:2009:WFT, author = "Stephen Jang and Billy Chan and Kevin Chung and Alan Mishchenko", title = "{WireMap}: {FPGA} Technology Mapping for Improved Routability and Enhanced {LUT} Merging", journal = j-TRETS, volume = "2", number = "2", pages = "14:1--14:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534924", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chung:2009:PTS, author = "Eric S. Chung and Michael K. Papamichael and Eriko Nurvitadhi and James C. 
Hoe and Ken Mai and Babak Falsafi", title = "{ProtoFlex}: Towards Scalable, Full-System Multiprocessor Simulations Using {FPGAs}", journal = j-TRETS, volume = "2", number = "2", pages = "15:1--15:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1534916.1534925", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:50 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Pellauer:2009:PNP, author = "Michael Pellauer and Muralidaran Vijayaraghavan and Michael Adler and Arvind and Joel Emer", title = "{A}-Port Networks: Preserving the Timed Behavior of Synchronous Systems for Modeling on {FPGAs}", journal = j-TRETS, volume = "2", number = "3", pages = "16:1--16:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575774.1575775", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:54 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cong:2009:FBH, author = "Jason Cong and Yi Zou", title = "{FPGA}-Based Hardware Acceleration of Lithographic Aerial Image Simulation", journal = j-TRETS, volume = "2", number = "3", pages = "17:1--17:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575774.1575776", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:54 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ahmed:2009:PTV, author = "Taneem Ahmed and Paul D. Kundarewich and Jason H. Anderson", title = "Packing Techniques for {Virtex-5 FPGAs}", journal = j-TRETS, volume = "2", number = "3", pages = "18:1--18:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575774.1575777", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:54 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Parandeh-Afshar:2009:FLC, author = "Hadi Parandeh-Afshar and Philip Brisk and Paolo Ienne", title = "An {FPGA} Logic Cell and Carry Chain Configurable as a 6:2 or 7:2 Compressor", journal = j-TRETS, volume = "2", number = "3", pages = "19:1--19:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575774.1575778", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:54 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Compton:2009:ISI, author = "Katherine Compton and Roger Woods and Christos Bouganis and Pedro Diniz", title = "Introduction to the Special Issue {ARC'08}", journal = j-TRETS, volume = "2", number = "4", pages = "20:1--20:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575780", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jin:2009:ERA, author = "Qiwei Jin and David B. 
Thomas and Wayne Luk and Benjamin Cope", title = "Exploring Reconfigurable Architectures for Tree-Based Option Pricing Models", journal = j-TRETS, volume = "2", number = "4", pages = "21:1--21:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575781", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Angelopoulou:2009:RRT, author = "Maria E. Angelopoulou and Christos-Savvas Bouganis and Peter Y. K. Cheung and George A. Constantinides", title = "Robust Real-Time Super-Resolution on {FPGA} and an Application to Video Enhancement", journal = j-TRETS, volume = "2", number = "4", pages = "22:1--22:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575782", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lo:2009:SOC, author = "Chia-Tien Dan Lo and Yi-Gang Tai", title = "Space Optimization on Counters for {FPGA}-Based {Perl} Compatible Regular Expressions", journal = j-TRETS, volume = "2", number = "4", pages = "23:1--23:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575783", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Vassiliadis:2009:ADF, author = "Nikolaos Vassiliadis and George Theodoridis and Spiridon Nikolaidis", title = "An Application Development Framework for {ARISE} Reconfigurable Processors", journal = j-TRETS, volume = "2", number = "4", pages = "24:1--24:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575784", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dragomir:2009:OLU, author = "Ozana Silvia Dragomir and Todor Stefanov and Koen Bertels", title = "Optimal Loop Unrolling and Shifting for Reconfigurable Architectures", journal = j-TRETS, volume = "2", number = "4", pages = "25:1--25:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575785", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Underwood:2009:SSL, author = "Keith D. Underwood and K. Scott Hemmert and Craig D. Ulmer", title = "From Silicon to Science: The Long Road to Production Reconfigurable Supercomputing", journal = j-TRETS, volume = "2", number = "4", pages = "26:1--26:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1575779.1575786", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Roldao:2010:HTF, author = "Antonio Roldao and George A. 
Constantinides", title = "A High Throughput {FPGA}-Based Floating Point Conjugate Gradient Implementation for Dense Matrices", journal = j-TRETS, volume = "3", number = "1", pages = "1:1--1:??", month = jan, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1661438.1661439", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:47:03 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dubois:2010:SMV, author = "David Dubois and Andrew Dubois and Thomas Boorman and Carolyn Connor and Steve Poole", title = "Sparse Matrix-Vector Multiplication on a Reconfigurable Supercomputer with Application", journal = j-TRETS, volume = "3", number = "1", pages = "2:1--2:??", month = jan, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1661438.1661440", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:47:03 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Drimer:2010:DBP, author = "Saar Drimer and Tim G{\"u}neysu and Christof Paar", title = "{DSPs}, {BRAMs}, and a Pinch of Logic: Extended Recipes for {AES} on {FPGAs}", journal = j-TRETS, volume = "3", number = "1", pages = "3:1--3:??", month = jan, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1661438.1661441", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:47:03 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Koh:2010:CMP, author = "Shannon Koh and Oliver Diessel", title = "Configuration Merging in Point-to-Point Networks for Module-Based {FPGA} Reconfiguration", journal = j-TRETS, volume = "3", number = "1", pages = "4:1--4:??", month = jan, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1661438.1661442", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:47:03 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Curreri:2010:PAF, author = "John Curreri and Seth Koehler and Alan D. 
George and Brian Holland and Rafael Garcia", title = "Performance Analysis Framework for High-Level Language Applications in Reconfigurable Computing", journal = j-TRETS, volume = "3", number = "1", pages = "5:1--5:??", month = jan, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1661438.1661443", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 16 09:47:03 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bodily:2010:CSI, author = "John Bodily and Brent Nelson and Zhaoyi Wei and Dah-Jye Lee and Jeff Chase", title = "A Comparison Study on Implementing Optical Flow and Digital Communications on {FPGAs} and {GPUs}", journal = j-TRETS, volume = "3", number = "2", pages = "6:1--6:??", month = may, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1754386.1754387", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 22 16:00:33 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "FPGA devices have often found use as higher-performance alternatives to programmable processors for implementing computations. Applications successfully implemented on FPGAs typically contain high levels of parallelism and often use simple statically scheduled control and modest arithmetic. Recently introduced computing devices such as coarse-grain reconfigurable arrays, multi-core processors, and graphical processing units promise to significantly change the computational landscape and take advantage of many of the same application characteristics that fit well on FPGAs. 
One real-time computing task, optical flow, is difficult to apply in robotic vision applications because of its high computational and data rate requirements, and so is a good candidate for implementation on FPGAs and other custom computing architectures. This article reports on a series of experiments mapping a collection of different algorithms onto both an FPGA and a GPU. For two different optical flow algorithms the GPU had better performance, while for a set of digital comm MIMO computations, they had similar performance. In all cases the FPGA implementations required 10x the development time. Finally, a discussion of the two technology's characteristics is given to show they achieve high performance in different ways.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "Digital communications; FPGA; GPU; optical flow; reconfigurable computing", } @Article{Papadopoulos:2010:TRM, author = "Konstantinos Papadopoulos and Ioannis Papaefstathiou", title = "{Titan-R}: a Multigigabit Reconfigurable Combined Compression\slash Decompression Unit", journal = j-TRETS, volume = "3", number = "2", pages = "7:1--7:??", month = may, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1754386.1754388", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 22 16:00:33 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Data compression techniques can alleviate bandwidth problems in even multigigabit networks and are especially useful when combined with encryption. 
This article demonstrates a reconfigurable hardware compressor/decompressor core, the Titan-R, which can compress/decompress data streams at 8.5 Gb/sec, making it the fastest reconfigurable such device ever proposed; the presented full-duplex implementation allows for fully symmetric compression and decompression rates at 8.5 Gbps each. Its compression algorithm is a variation of the most widely used and efficient such scheme, the Lempel--Ziv (LZ) algorithm that uses part of the previous input stream as the dictionary. In order to support this high network throughput, the Titan-R utilizes a very fine-grained pipeline and takes advantage of the high bandwidth provided by the distributed on-chip RAMs of state-of-the-art FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "data compression; FPGA; hardware algorithms; networking; parallel processing; reconfigurable computing; Stream processing", } @Article{Badrignans:2010:SSA, author = "Beno{\^\i}t Badrignans and David Champagne and Reouven Elbaz and Catherine Gebotys and Lionel Torres", title = "{SARFUM}: Security Architecture for Remote {FPGA} Update and Monitoring", journal = j-TRETS, volume = "3", number = "2", pages = "8:1--8:??", month = may, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1754386.1754389", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 22 16:00:33 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Remote update of hardware platforms or embedded systems is a convenient service enabled by Field Programmable Gate Array (FPGA)-based systems. This service is often essential in applications like space-based FPGA systems or set-top boxes. 
However, having the source of the update be remote from the FPGA system opens the door to a set of attacks that may challenge the confidentiality and integrity of the FPGA configuration, the bitstream. Existing schemes propose to encrypt and authenticate the bitstream to thwart these attacks. However, we show that they do not prevent the replay of old bitstream versions, and thus give adversaries an opportunity for downgrading the system. In this article, we propose a new architecture called SARFUM that, in addition to ensuring bitstream confidentiality and integrity, precludes the replay of old bitstreams. SARFUM also includes a protocol for the system designer to remotely monitor the running configuration of the FPGA. Following our presentation and analysis of the security protocols, we propose an example of implementation with the CCM (Counter with CBC-MAC) authenticated encryption standard. We also evaluate the impact of our architecture on the configuration time for different FPGA devices.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "authenticated encryption; bitstream security; FPGA; replay attack; security protocol; system downgrade", } @Article{Yoo:2010:IRR, author = "Sang-Kyung Yoo and Deniz Karakoyunlu and Berk Birand and Berk Sunar", title = "Improving the Robustness of Ring Oscillator {TRNGs}", journal = j-TRETS, volume = "3", number = "2", pages = "9:1--9:??", month = may, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1754386.1754390", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 22 16:00:33 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A ring oscillator-based true-random number generator design (Rings design) was introduced in Sunar et al. [2007]. The design was rigorously analyzed under a simple mathematical model and its performance characteristics were established. In this article we focus on the practical aspects of the Rings design on a reconfigurable logic platform and determine their implications on the earlier analysis framework. We make recommendations for avoiding pitfalls in real-life implementations by considering ring interaction, transistor-level effects, narrow signal rejection, transmission line attenuation, and sampler bias. Furthermore, we present experimental results showing that changing operating conditions such as the power supply voltage or the operating temperature may affect the output quality when the signal is subsampled. Hence, an attacker may shift the operating point via a simple noninvasive influence and easily bias the TRNG output. 
Finally, we propose modifications to the design which significantly improve its robustness against attacks, alleviate implementation-related problems, and simultaneously improve its area, throughput, and power performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "cryptography; Oscillator rings; true random number generators", } @Article{Huffmire:2010:SPR, author = "Ted Huffmire and Timothy Levin and Thuy Nguyen and Cynthia Irvine and Brett Brotherton and Gang Wang and Timothy Sherwood and Ryan Kastner", title = "Security Primitives for Reconfigurable Hardware-Based Systems", journal = j-TRETS, volume = "3", number = "2", pages = "10:1--10:??", month = may, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1754386.1754391", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 22 16:00:33 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Computing systems designed using reconfigurable hardware are increasingly composed using a number of different Intellectual Property (IP) cores, which are often provided by third-party vendors that may have different levels of trust. Unlike traditional software where hardware resources are mediated using an operating system, IP cores have fine-grain control over the underlying reconfigurable hardware. To address this problem, the embedded systems community requires novel security primitives that address the realities of modern reconfigurable hardware. In this work, we propose security primitives using ideas centered around the notion of ``moats and drawbridges.'' The primitives encompass four design properties: logical isolation, interconnect traceability, secure reconfigurable broadcast, and configuration scrubbing. 
Each of these is a fundamental operation with easily understood formal properties, yet they map cleanly and efficiently to a wide variety of reconfigurable devices. We carefully quantify the required overheads of the security techniques on modern FPGA architectures across a number of different applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "Advanced Encryption Standard (AES); controlled sharing; enforcement mechanisms; execution monitors; Field Programmable Gate Arrays (FPGAs); hardware security; isolation; memory protection; reference monitors; security policies; security primitives; separation; static analysis; Systems-on-a-Chip (SoCs)", } @Article{Hemmert:2010:FEF, author = "K. Scott Hemmert and Keith D. Underwood", title = "Fast, Efficient Floating-Point Adders and Multipliers for {FPGAs}", journal = j-TRETS, volume = "3", number = "3", pages = "11:1--11:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839480.1839481", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 8 18:26:34 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Floating-point applications are a growing trend in the FPGA community. As such, it has become critical to create floating-point units optimized for standard FPGA technology. Unfortunately, the FPGA design space is very different from the VLSI design space; thus, optimizations for FPGAs can differ significantly from optimizations for VLSI. In particular, the FPGA environment constrains the design space such that only limited parallelism can be effectively exploited to reduce latency. Obtaining the right balances between clock speed, latency, and area in FPGAs can be particularly challenging. 
This article presents implementation details for an IEEE-754 standard floating-point adder and multiplier for FPGAs. The designs presented here enable a Xilinx Virtex4 FPGA (-11 speed grade) to achieve 270 MHz IEEE compliant double precision floating-point performance with a 9-stage adder pipeline and 14-stage multiplier pipeline. The area requirement is approximately 500 slices for the adder and under 750 slices for the multiplier.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", keywords = "floating point; FPGA; HPC; reconfigurable computing", }

@Article{Sghaier:2010:IAT,
  author =       "Ahmad Sghaier and Shawki Areibi and Robert Dony",
  title =        "Implementation Approaches Trade-Offs for {WiMax OFDM} Functions on Reconfigurable Platforms",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839482",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This work investigates several approaches for implementing the OFDM functions of the fixed-WiMax standard on reconfigurable platforms. In the first phase, a custom RTL approach, using VHDL, is investigated. The approach shows the capability of a medium-size FPGA to accommodate the OFDM functions of a fixed-WiMax transceiver with only 50\% occupation rate. In the second phase, a high-level approach based on the AccelDSP tool is used and compared to the custom RTL approach. The approach presents an easy flow to transfer MATLAB floating-point code into synthesizable cores. The AccelDSP approach shows an area overhead of 10\%, while allowing early architectural exploration and accelerating the design time by a factor of two. However, the performance figure obtained is almost 1/4 of that obtained in the custom RTL approach. In the third phase, the Tensilica Xtensa configurable processor is targeted, which presents remarkable figures in terms of power, area, and design time. Comparing the three approaches indicates that the custom RTL approach has the lead in terms of performance. However, both the AccelDSP and the Tensilica Xtensa approaches show fast design time and early architectural exploration capability. In terms of power, the obtained estimation results show that the configurable Xtensa processor approach has the lead, where approximately the total power consumed is about 12--15 times less than those results obtained by the other two approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "AccelDSP; ASIP; custom RTL; FPGA; Tensilica; WiMax",
}

@Article{Smith:2010:AFA,
  author =       "Alastair M. Smith and George A. Constantinides and Peter Y. K. Cheung",
  title =        "An Automated Flow for Arithmetic Component Generation in Field-Programmable Gate Arrays",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839483",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "State-of-the-art configurable logic platforms, such as Field-Programmable Gate Arrays (FPGAs), consist of a heterogeneous mixture of different component types. Compared to traditional homogeneous configurable platforms, heterogeneity provides speed and density advantages. This is due to the replacement of inefficient programmable logic and routing with specialized logic and fixed interconnect in components such as memories, embedded processor units, and fused arithmetic units. Given the increasing complexity of these components, this article introduces a method to automatically propose and explore the benefits of different types of fused arithmetic units. The methods are based on common subgraph extraction techniques, meaning that it is possible to explore different subcircuits that occur frequently across a set of benchmarks. A quantitative analysis is performed of the various fused arithmetic circuits identified by our tool, which are then automatically synthesized to an ASIC process, providing a study of the speed and area benefits of the components. The results of this study provide bounds on the performance of heterogeneous FPGAs: by incorporating coarse-grain components which match the specific needs of a set of benchmarks we show that significant improvements in circuit speed and area can be made.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "common subgraph; FPGA; reconfigurable logic",
}

@Article{Moscola:2010:HAR,
  author =       "James Moscola and Ron K. Cytron and Young H. Cho",
  title =        "Hardware-Accelerated {RNA} Secondary-Structure Alignment",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839484",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The search for homologous RNA molecules---sequences of RNA that might behave similarly due to similarity in their physical (secondary) structure---is currently a computationally intensive task. Moreover, RNA sequences are populating genome databases at a pace unmatched by gains in standard processor performance. While software tools such as Infernal can efficiently find homologies among RNA families and genome databases of modest size, the continuous advent of new RNA families and the explosive growth in volume of RNA sequences necessitate a faster approach.\par This work introduces two different architectures for accelerating the task of finding homologous RNA molecules in a genome database. The first architecture takes advantage of the tree-like configuration of the covariance models used to represent the consensus secondary structure of an RNA family and converts it directly into a highly-pipelined processing engine. Results for this architecture show a 24$ \times $ speedup over Infernal when processing a small RNA model. It is estimated that the architecture could potentially offer several thousands of times speedup over Infernal on larger models, provided that there are sufficient hardware resources available.\par The second architecture is introduced to address the steep resource requirements of the first architecture. It utilizes a uniform array of processing elements and schedules all of the computations required to scan for an RNA homolog onto those processing elements. The estimated speedup for this architecture over the Infernal software package ranges from just under 20$ \times $ to over 2,350$ \times $.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Bioinformatics; RNA; secondary-structure alignment",
}

@Article{Ben-Asher:2010:RMC,
  author =       "Yosi Ben-Asher and Danny Meisler and Nadav Rotem",
  title =        "Reducing Memory Constraints in Modulo Scheduling Synthesis for {FPGAs}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839485",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In High-Level Synthesis (HLS), extracting parallelism in order to create small and fast circuits is the main advantage of HLS over software execution. Modulo Scheduling (MS) is a technique in which a loop is parallelized by overlapping different parts of successive iterations. This ability to extract parallelism makes MS an attractive synthesis technique for loop acceleration. In this work we consider two problems involved in the use of MS which are central when targeting FPGAs. Current MS scheduling techniques sacrifice execution times in order to meet resource and delay constraints. Let ``ideal'' execution times be the ones that could have been obtained by MS had we ignored resource and delay constraints. Here we pose the opposite problem, which is more suitable for HLS, namely, how to reduce resource constraints without sacrificing the ideal execution time. We focus on reducing the number of memory ports used by the MS synthesis, which we believe is a crucial resource for HLS. In addition to reducing the number of memory ports we consider the need to develop MS techniques that are fast enough to allow interactive synthesis times and repeated applications of the MS to explore different possibilities of synthesizing the circuits. Current solutions for MS synthesis that can handle memory constraints are too slow to support interactive synthesis. We formalize the problem of reducing the number of parallel memory references in every row of the kernel by a novel combinatorial setting. The proposed technique is based on inserting dummy operations in the kernel and by doing so, performing modulo-shift operations such that the maximal number of parallel memory references in a row is reduced. Experimental results suggest improved execution times for the synthesized circuit. The synthesis takes only a few seconds even for large-size loops.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; high-level synthesis; memory optimizations; modulo-scheduling",
}

@Article{Wang:2010:VVP,
  author =       "Xiaojun Wang and Miriam Leeser",
  title =        "{VFloat}: a Variable Precision Fixed- and Floating-Point Library for Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839486",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Optimal reconfigurable hardware implementations may require the use of arbitrary floating-point formats that do not necessarily conform to IEEE specified sizes. We present a variable precision floating-point library (VFloat) that supports general floating-point formats including IEEE standard formats. Most previously published floating-point formats for use with reconfigurable hardware are subsets of our format. Custom datapaths with optimal bitwidths for each operation can be built using the variable precision hardware modules in the VFloat library, enabling a higher level of parallelism. The VFloat library includes three types of hardware modules for format control, arithmetic operations, and conversions between fixed-point and floating-point formats. The format conversions allow for hybrid fixed- and floating-point operations in a single design. This gives the designer control over a large number of design possibilities including format as well as number range within the same application. In this article, we give an overview of the components in the VFloat library and demonstrate their use in an implementation of the K-means clustering algorithm applied to multispectral satellite images.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "clustering; floating-point; Reconfigurable hardware",
}

@Article{Purnaprajna:2010:RRM,
  author =       "Madhura Purnaprajna and Mario Porrmann and Ulrich Rueckert and Michael Hussmann and Michael Thies and Uwe Kastens",
  title =        "Runtime Reconfiguration of Multiprocessors Based on Compile-Time Analysis",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839487",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In multiprocessors, performance improvement is typically achieved by exploring parallelism with fixed granularities, such as instruction-level, task-level, or data-level parallelism. We introduce a new reconfiguration mechanism that facilitates variations in these granularities in order to optimize resource utilization in addition to performance improvements. Our reconfigurable multiprocessor QuadroCore combines the advantages of reconfigurability and parallel processing. In this article, a unified hardware-software approach for the design of our QuadroCore is presented. This design flow is enabled via compiler-driven reconfiguration which matches application-specific characteristics to a fixed set of architectural variations. A special reconfiguration mechanism has been developed that alters the architecture within a single clock cycle.\par The QuadroCore has been implemented on Xilinx XC2V6000 for functional validation and on UMC's 90nm standard cell technology for performance estimation. A diverse set of applications have been mapped onto the reconfigurable multiprocessor to meet orthogonal performance characteristics in terms of time and power. Speedup measurements show a 2--11 times performance increase in comparison to a single processor. Additionally, the reconfiguration scheme has been applied to save power in data-parallel applications. Gate-level simulations have been performed to measure the power-performance trade-offs for two computationally complex applications. The power reports confirm that introducing this scheme of reconfiguration results in power savings in the range of 15--24\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "compilation for multiprocessors; Reconfigurable multiprocessors",
}

@Article{Banerjee:2010:BMA,
  author =       "Sudarshan Banerjee and Elaheh Bozorgzadeh and Juanjo Noguera and Nikil Dutt",
  title =        "Bandwidth Management in Application Mapping for Dynamically Reconfigurable Architectures",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839488",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Partial dynamic reconfiguration (often referred to as partial RTR) enables true on-demand computing. In an on-demand computing environment, a dynamically invoked application is assigned resources such as data bandwidth, configurable logic. The limited logic resources are customized during application execution by exploiting partial RTR. In this article, we propose an approach that maximizes application performance when available bandwidth and logic resources are limited. Our proposed approach is based on theoretical principles of minimizing application schedule length under bandwidth and logic resource constraints. It includes detailed microarchitectural considerations on a commercially popular reconfigurable device, and it exploits partial RTR very effectively by utilizing data-parallelism property of common image-processing applications. We present extensive application case studies on a cycle-accurate simulation platform that includes detailed resource considerations of the Xilinx Virtex XC2V3000. Our experimental results demonstrate that applying our proposed approach to common image-filtering applications leads to 15--20\% performance gain in scenarios with limited bandwidth, when compared to prior work that also exploits data-parallelism with RTR but includes simpler bandwidth considerations. Last but not the least, we also demonstrate how our proposed theoretical principles can be directly applied to solve related problems such as minimizing schedule length under logic resource and power constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "bandwidth; Partial RTR; scheduling",
}

%%% --- Volume 3, Number 4 (November 2010) ---

@Article{Williams:2010:CFR,
  author =       "Jason Williams and Chris Massie and Alan D. George and Justin Richardson and Kunal Gosrani and Herman Lam",
  title =        "Characterization of Fixed and Reconfigurable Multi-Core Devices for Application Acceleration",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862649",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Huang:2010:RCA,
  author =       "Miaoqing Huang and Vikram K. Narayana and Harald Simmler and Olivier Serres and Tarek El-Ghazawi",
  title =        "Reconfiguration and Communication-Aware Task Scheduling for High-Performance Reconfigurable Computing",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862650",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sano:2010:FAB,
  author =       "Kentaro Sano and Wang Luzhou and Yoshiaki Hatsuda and Takanori Iizuka and Satoru Yamamoto",
  title =        "{FPGA}-Array with Bandwidth-Reduction Mechanism for Scalable and Power-Efficient Numerical Simulations Based on Finite Difference Methods",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862651",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Saldana:2010:MPM,
  author =       "Manuel Salda{\~n}a and Arun Patel and Christopher Madill and Daniel Nunes and Danyao Wang and Paul Chow and Ralph Wittig and Henry Styles and Andrew Putnam",
  title =        "{MPI} as a Programming Model for High-Performance Reconfigurable Computers",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "22:1--22:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862652",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chiu:2010:MDS,
  author =       "Matt Chiu and Martin C. Herbordt",
  title =        "Molecular Dynamics Simulations on High-Performance Reconfigurable Computing Systems",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "23:1--23:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862653",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Montone:2010:PFD,
  author =       "Alessio Montone and Marco D. Santambrogio and Donatella Sciuto and Seda Ogrenci Memik",
  title =        "Placement and Floorplanning in Dynamically Reconfigurable {FPGAs}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "24:1--24:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862654",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Reardon:2010:SFR,
  author =       "Casey Reardon and Eric Grobelny and Alan D. George and Gongyu Wang",
  title =        "A Simulation Framework for Rapid Analysis of Reconfigurable Computing Systems",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "25:1--25:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862655",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tian:2010:HPQ,
  author =       "Xiang Tian and Khaled Benkrid",
  title =        "High-Performance Quasi-{Monte Carlo} Financial Simulation: {FPGA} vs. {GPP} vs. {GPU}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "26:1--26:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862656",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% --- Volume 4, Number 1 (December 2010) ---

@Article{Woods:2010:GEA,
  author =       "Roger Woods and J{\"u}rgen Becker and Peter Athanas and Fearghal Morgan",
  title =        "Guest Editorial {ARC 2009}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857928",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Saiprasert:2010:OHA,
  author =       "Chalermpol Saiprasert and Christos-S. Bouganis and George A. Constantinides",
  title =        "An Optimized Hardware Architecture of a Multivariate {Gaussian} Random Number Generator",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857929",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Monte Carlo simulation is one of the most widely used techniques for computationally intensive simulations in mathematical analysis and modeling. A multivariate Gaussian random number generator is one of the main building blocks of such a system. Field Programmable Gate Arrays (FPGAs) are gaining increased popularity as an alternative means to the traditional general purpose processors targeting the acceleration of the computationally expensive random number generator block. This article presents a novel approach for mapping a multivariate Gaussian random number generator onto an FPGA by optimizing the computational path in terms of hardware resource usage subject to an acceptable error in the approximation of the distribution of interest. The proposed approach is based on the eigenvalue decomposition algorithm which leads to a design with different precision requirements in the computational paths. An analysis on the impact of the error due to truncation/rounding operation along the computational path is performed and an analytical expression of the error inserted into the system is presented. Based on the error analysis, three algorithms that optimize the resource utilization and at the same time minimize the error in the output of the system are presented and compared. Experimental results reveal that the hardware resource usage on an FPGA as well as the error in the approximation of the distribution of interest are significantly reduced by the use of the optimization techniques introduced in the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kahoul:2010:EHA,
  author =       "Asma Kahoul and Alastair M. Smith and George A. Constantinides and Peter Y. K. Cheung",
  title =        "Efficient Heterogeneous Architecture Floorplan Optimization using Analytical Methods",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857930",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kepa:2010:DAS,
  author =       "K. Kepa and F. Morgan and K. Ko{\'s}ciuszkiewicz and L. Braun and M. H{\"u}bner and J. Becker",
  title =        "Design Assurance Strategy and Toolset for Partially Reconfigurable {FPGA} Systems",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857931",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Inoue:2010:VGL,
  author =       "Kazuki Inoue and Qian Zhao and Yasuhiro Okamoto and Hiroki Yosho and Motoki Amagasaki and Masahiro Iida and Toshinori Sueyoshi",
  title =        "A Variable-Grain Logic Cell and Routing Architecture for a Reconfigurable {IP} Core",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857932",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Guo:2010:OSC,
  author =       "Xu Guo and Patrick Schaumont",
  title =        "Optimized System-on-Chip Integration of a Programmable {ECC} Coprocessor",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857933",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sterpone:2010:NTD,
  author =       "Luca Sterpone",
  title =        "A New Timing Driven Placement Algorithm for Dependable Circuits on {SRAM}-based {FPGAs}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "7:1--7:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857934",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lanuzza:2010:ESR,
  author =       "M. Lanuzza and P. Zicari and F. Frustaci and S. Perri and P. Corsonello",
  title =        "Exploiting Self-Reconfiguration Capability to Improve {SRAM}-based {FPGA} Robustness in Space and Avionics Applications",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857935",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Hsiung:2010:SPH,
  author =       "Pao-Ann Hsiung and Chun-Hsian Huang and Jih-Sheng Shen and Chen-Chi Chiang",
  title =        "Scheduling and Placement of Hardware\slash Software Real-Time Relocatable Tasks in Dynamically Partially Reconfigurable Systems",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857936",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kanazawa:2010:ASL,
  author =       "Kenji Kanazawa and Tsutomu Maruyama",
  title =        "An Approach for Solving Large {SAT} Problems on {FPGA}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857937",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lu:2010:ERD,
  author =       "Yingxi Lu and Maire O'Neill and John McCanny",
  title =        "Evaluation of Random Delay Insertion against {DPA} on {FPGAs}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857938",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% --- Volume 4, Number 2 (May 2011) ---

@Article{Bergeron:2011:LTF,
  author =       "Etienne Bergeron and Louis-David Perron and Marc Feeley and Jean Pierre David",
  title =        "Logarithmic-Time {FPGA} Bitstream Analysis: a Step Towards {JIT} Hardware Compilation",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "12:1--12:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968503",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Vaidya:2011:NMC,
  author =       "Pranav Vaidya and Jaehwan John Lee",
  title =        "A Novel Multicontext Coarse-Grained Reconfigurable Architecture {(CGRA)} For Accelerating Column-Oriented Databases",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968504",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{ONeill:2011:SPM,
  author =       "Shane O'Neill and Roger Francis Woods and Alan James Marshall and Qi Zhang",
  title =        "A Scalable and Programmable Modular Traffic Manager Architecture",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968505",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Nakajima:2011:FOR,
  author =       "Mao Nakajima and Minoru Watanabe",
  title =        "Fast Optical Reconfiguration of a Nine-Context {DORGA} Using a Speed Adjustment Control",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968506",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tai:2011:POA,
  author =       "Tzu-Chiang Tai and Yen-Tai Lai",
  title =        "A Performance-Oriented Algorithm with Consideration on Communication Cost for Dynamically Reconfigurable {FPGA} Partitioning",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968507",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Demertzi:2011:DSO,
  author =       "Melina Demertzi and Pedro C. Diniz and Mary W. Hall and Anna C. Gilbert and Yi Wang",
  title =        "Domain-Specific Optimization of Signal Recognition Targeting {FPGAs}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "17:1--17:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968508",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Galuzzi:2011:ISE,
  author =       "Carlo Galuzzi and Koen Bertels",
  title =        "The Instruction-Set Extension Problem: a Survey",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "18:1--18:28",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968509",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rupnow:2011:SAD, author = "Kyle Rupnow and Keith D. Underwood and Katherine Compton", title = "Scientific Application Demands on a Reconfigurable Functional Unit Interface", journal = j-TRETS, volume = "4", number = "2", pages = "19:1--19:??", month = may, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1968502.1968510", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 7 18:34:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kaganov:2011:FAM, author = "Alexander Kaganov and Asif Lakhany and Paul Chow", title = "{FPGA} Acceleration of {MultiFactor CDO} Pricing", journal = j-TRETS, volume = "4", number = "2", pages = "20:1--20:??", month = may, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1968502.1968511", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 7 18:34:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Labrecque:2011:ASS, author = "Martin Labrecque and Mark C. Jeffrey and J. Gregory Steffan", title = "Application-specific signatures for transactional memory in soft processors", journal = j-TRETS, volume = "4", number = "3", pages = "21:1--21:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000833", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Boland:2011:OMB, author = "David Boland and George A. 
Constantinides", title = "Optimizing memory bandwidth use and performance for matrix-vector multiplication in iterative methods", journal = j-TRETS, volume = "4", number = "3", pages = "22:1--22:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000834", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Glaser:2011:TFT, author = "Johann Glaser and Markus Damm and Jan Haase and Christoph Grimm", title = "{TR-FSM}: Transition-Based reconfigurable finite state machine", journal = j-TRETS, volume = "4", number = "3", pages = "23:1--23:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000835", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Parvez:2011:ASF, author = "Husain Parvez and Zied Marrakchi and Alp Kilic and Habib Mehrez", title = "Application-Specific {FPGA} using heterogeneous logic blocks", journal = j-TRETS, volume = "4", number = "3", pages = "24:1--24:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000836", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yan:2011:FBA, author = "Jing Yan and Ning-Yi Xu and Xiong-Fei Cai and Rui Gao and Yu Wang and Rong Luo and Feng-Hsiung Hsu", title = "An {FPGA}-based accelerator for {LambdaRank} in {Web} search engines", journal = j-TRETS, volume = "4", number = "3", pages = "25:1--25:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000837", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In modern Web search engines, Neural Network (NN)-based learning-to-rank algorithms are intensively used to increase the quality of search results. LambdaRank is one such algorithm. However, it is hard to be efficiently accelerated by computer clusters or GPUs, because: (i) the cost function for the ranking problem is much more complex than that of traditional Back-Propagation (BP) NNs, and (ii) no coarse-grained parallelism exists in the algorithm.
This article presents an FPGA-based accelerator solution to provide high computing performance with low power consumption. A compact deep pipeline is proposed to handle the complex computing in the batch updating. The area scales linearly with the number of hidden nodes in the algorithm. We also carefully design a data format to enable streaming consumption of the training data from the host computer. The accelerator shows up to 15.3X (with PCIe x4) and 23.9X (with PCIe x8) speedup compared with the pure software implementation on datasets from a commercial search engine.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Aggarwal:2011:SMP, author = "Vikas Aggarwal and Alan D. George and Changil Yoon and Kishore Yalamanchili and Herman Lam", title = "{SHMEM+}: a multilevel-{PGAS} programming model for reconfigurable supercomputing", journal = j-TRETS, volume = "4", number = "3", pages = "26:1--26:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000838", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Holland:2011:AMM, author = "Brian Holland and Alan D. George and Herman Lam and Melissa C. 
Smith", title = "An analytical model for multilevel performance prediction of Multi-{FPGA} systems", journal = j-TRETS, volume = "4", number = "3", pages = "27:1--27:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000839", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "27", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shannon:2011:LRH, author = "Lesley Shannon and Paul Chow", title = "Leveraging reconfigurability in the hardware\slash software codesign process", journal = j-TRETS, volume = "4", number = "3", pages = "28:1--28:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000840", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "28", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Nava:2011:ADR, author = "Federico Nava and Donatella Sciuto and Marco Domenico Santambrogio and Stefan Herbrechtsmeier and Mario Porrmann and Ulf Witkowski and Ulrich Rueckert", title = "Applying dynamic reconfiguration in the mobile robotics domain: a case study on computer vision algorithms", journal = j-TRETS, volume = "4", number = "3", pages = "29:1--29:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000841", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "29", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Koehler:2011:PAB, author = "Seth Koehler and Greg Stitt and Alan D. George", title = "Platform-aware bottleneck detection for reconfigurable computing applications", journal = j-TRETS, volume = "4", number = "3", pages = "30:1--30:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000832.2000842", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Aug 30 08:13:57 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "30", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cheung:2011:ISS, author = "Peter Y. K. 
Cheung", title = "Introduction to special section {FPGA 2009}", journal = j-TRETS, volume = "4", number = "4", pages = "31:1--31:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068717", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "31", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Luu:2011:VFC, author = "Jason Luu and Ian Kuon and Peter Jamieson and Ted Campbell and Andy Ye and Wei Mark Fang and Kenneth Kent and Jonathan Rose", title = "{VPR 5.0}: {FPGA CAD} and architecture exploration tools with single-driver routing, heterogeneity and process scaling", journal = j-TRETS, volume = "4", number = "4", pages = "32:1--32:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068718", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The VPR toolset has been widely used in FPGA architecture and CAD research, but has not evolved over the past decade. This article describes and illustrates the use of a new version of the toolset that includes four new features: first, it supports a broad range of single-driver routing architectures, which have superior architectural and electrical properties over the prior multidriver approach (and which is now employed in the majority of FPGAs sold). Second, it can now model, for placement and routing a heterogeneous selection of hard logic blocks. This is a key (but not final) step toward the inclusion of blocks such as memory and multipliers.
Third, we provide optimized electrical models for a wide range of architectures in different process technologies, including a range of area-delay trade-offs for each single architecture. Finally, to maintain robustness and support future development the release includes a set of regression tests for the software. To illustrate the use of the new features, we explore several architectural issues: the FPGA area efficiency versus logic block granularity, the effect of single-driver routing, and a simple use of the heterogeneity to explore the impact of hard multipliers on wiring track count.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "32", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rubin:2011:CYO, author = "Raphael Rubin and Andr{\'e} Dehon", title = "Choose-your-own-adventure routing: Lightweight load-time defect avoidance", journal = j-TRETS, volume = "4", number = "4", pages = "33:1--33:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068719", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Aggressive scaling increases the number of devices we can integrate per square millimeter but makes it increasingly difficult to guarantee that each device fabricated has the intended operational characteristics. Without careful mitigation, component yield rates will fall, potentially negating the economic benefits of scaling. The fine-grained reconfigurability inherent in FPGAs is a powerful tool that can allow us to drop the stringent requirement that every device be fabricated perfectly in order for a component to be useful. 
To exploit inherent FPGA reconfigurability while avoiding full CAD mapping, we propose lightweight techniques compatible with the current single bitstream model that can avoid defective devices, reducing yield loss at high defect rates. In particular, by embedding testing operations and alternative path configurations into the bitstream, each FPGA can avoid defects by making only simple, greedy decisions at bitstream load time. With 20\% additional tracks above the minimum routable channel width, routes can tolerate 0.01\% switch and wire defect rates, raising yield from essentially 0\% to near 100\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "33", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mishchenko:2011:SDC, author = "Alan Mishchenko and Robert Brayton and Jie-Hong R. Jiang and Stephen Jang", title = "Scalable don't-care-based logic optimization and resynthesis", journal = j-TRETS, volume = "4", number = "4", pages = "34:1--34:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068720", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We describe an optimization method for combinational and sequential logic networks, with emphasis on scalability. The proposed resynthesis (a) is capable of substantial logic restructuring, (b) is customizable to solve a variety of optimization tasks, and (c) has reasonable runtime on industrial designs. The approach uses don't-cares computed for a window surrounding a node and can take into account external don't-cares (e.g., unreachable states). 
It uses a SAT solver for all aspects of Boolean manipulation: computing don't-cares for a node in the window, and deriving a new Boolean function of the node after resubstitution. Experimental results on 6-input LUT networks after a high effort synthesis show substantial reductions in area and delay. When applied to 20 large academic benchmarks, the LUT counts and logic levels are reduced by 45.0\% and 12.2\%, respectively. The longest runtime for synthesis and mapping is about two minutes. When applied to a set of 14 industrial benchmarks ranging up to 83K 6-LUTs, the LUT counts and logic levels are reduced by 11.8\% and 16.5\%, respectively. The longest runtime is about 30 minutes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "34", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kennings:2011:FTM, author = "Andrew Kennings and Kristofer Vorwerk and Arun Kundu and Val Pevzner and Andy Fox", title = "{FPGA} technology mapping with encoded libraries and staged priority cuts", journal = j-TRETS, volume = "4", number = "4", pages = "35:1--35:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068721", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Technology mapping is an important step in the FPGA CAD flow in which a network of simple gates is converted into a network of logic blocks. This article considers enhancements to a traditional LUT-based mapping algorithm for an FPGA comprised of logic blocks which implement only a subset of functions of up to k variables; specifically, the logic block is a partial LUT, but it possesses more inputs than a typical LUT. 
An analysis of the logic block is presented, and techniques for postmapping area recovery and timing-driven buffer insertion are also described. Numerical results are put forth which substantiate the efficacy of the proposed methods using real circuits mapped to a commercial FPGA architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "35", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Papadimitriou:2011:PPR, author = "Kyprianos Papadimitriou and Apostolos Dollas and Scott Hauck", title = "Performance of partial reconfiguration in {FPGA} systems: a survey and a cost model", journal = j-TRETS, volume = "4", number = "4", pages = "36:1--36:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068722", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Fine-grain reconfigurable devices suffer from the time needed to load the configuration bitstream. Even for small bitstreams in partially reconfigurable FPGAs this time cannot be neglected. In this article we survey the performance of the factors that contribute to the reconfiguration speed. Then, we study an FPGA-based system architecture and with real experiments we produce a cost model of Partial Reconfiguration (PR). This model is introduced to calculate the expected reconfiguration time and throughput. In order to develop a realistic model we take into account all the physical components that participate in the reconfiguration process. We analyze the parameters that affect the generality of the model and the adjustments needed per system for error-free evaluation. 
We verify it with real measurements, and then we employ it to evaluate existing systems presented in previous publications. The percentage error of the cost model when comparing its results with the actual values of those publications varies from 36\% to 63\%, whereas existing works report differences up to two orders of magnitude. Present work enables a user to evaluate PR and decide whether it is suitable for a certain application prior to entering the complex PR design flow.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "36", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2011:EDL, author = "Xiaoheng Chen and Venkatesh Akella", title = "Exploiting data-level parallelism for energy-efficient implementation of {LDPC} decoders and {DCT} on an {FPGA}", journal = j-TRETS, volume = "4", number = "4", pages = "37:1--37:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068723", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We explore the use of Data-Level Parallelism (DLP) as a way of improving the energy efficiency and power consumption involved in running applications on an FPGA. We show that static power consumption is a significant fraction of the overall power consumption in an FPGA and that it does not change significantly even as the area required by an architecture increases, because of the dominance of interconnect in an FPGA. We show that the degree of DLP can be used in conjunction with frequency scaling to reduce the overall power consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "37", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Easwaran:2011:NLB, author = "Lakshmi Easwaran and Ali Akoglu", title = "Net-length-based routability-driven power-aware clustering", journal = j-TRETS, volume = "4", number = "4", pages = "38:1--38:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068724", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The state-of-the-art power-aware clustering tool, P-T-VPack, achieves energy reduction by localizing nets with high switching activity at the expense of channel width and area. In this study, we employ predicted individual postplacement net length information during clustering and prioritize longer nets. This approach targets the capacitance factor for energy reduction, and prioritizes longer nets for channel width and area reduction. We first introduce a new clustering strategy, W-T-VPack, which replaces the switching activity in P-T-VPack with a net length factor. We obtain a 9.87\% energy reduction over T-VPack (3.78\% increase over P-T-VPack), while at the same time completely eliminating P-T-VPack's channel width and area overhead. We then introduce W-P-T-VPack, which combines switching activity and net length factors. W-P-T-VPack achieves 14.26\% energy reduction (0.31\% increase over P-T-VPack), while further improving channel width by up to 12.87\% for different cluster sizes. We investigate the energy performance of routability (channel width)-driven clustering algorithms, and show that W-T-VPack consistently outperforms T-RPack and iRAC by at least 11.23\% and 9.07\%, respectively. 
We conclude that net-length-based clustering is an effective method to concurrently target energy and channel width.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "38", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Parandeh-Afshar:2011:CTS, author = "Hadi Parandeh-Afshar and Arkosnato Neogy and Philip Brisk and Paolo Ienne", title = "Compressor tree synthesis on commercial high-performance {FPGAs}", journal = j-TRETS, volume = "4", number = "4", pages = "39:1--39:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068725", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Compressor trees are a class of circuits that generalizes multioperand addition and the partial product reduction trees of parallel multipliers using carry-save arithmetic. Compressor trees naturally occur in many DSP applications, such as FIR filters, and, in the more general case, their use can be maximized through the application of high-level transformations to arithmetically intensive data flow graphs. Due to the presence of carry-chains, it has long been thought that trees of 2- or 3-input carry-propagate adders are more efficient than compressor trees for FPGA synthesis; however, this is not the case. This article presents a heuristic for FPGA synthesis of compressor trees that outperforms adder trees and exploits carry-chains when possible. The experimental results show that, on average, the use of compressor trees can reduce critical path delay by 33\% and 45\% respectively, compared to adder trees synthesized on the Xilinx Virtex-5 and Altera Stratix III FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "39", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Inoue:2011:TCD, author = "Hiroaki Inoue and Junya Yamada and Hideyuki Yoneda and Katsumi Togawa and Masato Motomura and Koichiro Furuta", title = "Test compression for dynamically reconfigurable processors", journal = j-TRETS, volume = "4", number = "4", pages = "40:1--40:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2068716.2068726", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Mar 16 16:20:35 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We present the world's first test compression technique that features automation of compression rules for test time reduction on dynamically reconfigurable processors. Evaluations on an actual 40-nm product show that our technique achieves a 2.7 times compression ratio for original configuration information (better than does GZIP), the peak decompression bandwidth of 1.6 GB/s, and 2.7 times shorter test times.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "40", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zick:2012:LCS, author = "Kenneth M. Zick and John P. 
Hayes", title = "Low-cost sensing with ring oscillator arrays for healthier reconfigurable systems", journal = j-TRETS, volume = "5", number = "1", pages = "1:1--1:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133353", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Electronic systems on a chip increasingly suffer from component variation, voltage noise, thermal hotspots, and other subtle physical phenomena. Systems with reconfigurability have unique opportunities for adapting to such effects. Required, however, are low-cost, fine-grained methods for sensing physical parameters. This article presents powerful, novel approaches to online sensing, including methods for designing compact reconfigurable sensors, low-cost threshold detection, and several enhanced measurement procedures. Together, the approaches help enable systems to autonomously uncover a wealth of physical information. A highly efficient counter and improved ring oscillator are introduced, enabling an entire sensor node in just 8 Virtex-5 LUTs. We describe how variations can be measured in delay, temperature, switching-induced IR drop, and leakage-induced IR drop. We demonstrate the proposed approach with an experimental system based on a Virtex-5, instrumented with over 100 sensors at an overhead of only 1.3\%. Results from thermally controlled experiments provide some surprising insights and illustrate the utility of the approach. Online sensing can help open the door to physically adaptive computing, including fine-grained power, reliability, and health management schemes for systems on a chip.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Michail:2012:EHT, author = "Harris E. Michail and George S. Athanasiou and Vasilis Kelefouras and George Theodoridis and Costas E. Goutis", title = "On the exploitation of a high-throughput {SHA-256 FPGA} design for {HMAC}", journal = j-TRETS, volume = "5", number = "1", pages = "2:1--2:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133354", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "High-throughput and area-efficient designs of hash functions and corresponding mechanisms for Message Authentication Codes (MACs) are in high demand due to new security protocols that have arisen and call for security services in every transmitted data packet. For instance, IPv6 incorporates the IPSec protocol for secure data transmission. However, the IPSec's performance bottleneck is the HMAC mechanism which is responsible for authenticating the transmitted data. HMAC's performance bottleneck in its turn is the underlying hash function. In this article a high-throughput and small-size SHA-256 hash function FPGA design and the corresponding HMAC FPGA design is presented. Advanced optimization techniques have been deployed leading to a SHA-256 hashing core which performs more than 30\% better, compared to the next better design. This improvement is achieved both in terms of throughput as well as in terms of throughput/area cost factor. It is the first reported SHA-256 hashing core that exceeds 11Gbps (after place and route in Xilinx Virtex 6 board).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Olivares:2012:RAV, author = "Joaqu{\'\i}n Olivares", title = "Reconfigurable architecture for {VBSME} with variable pixel precision", journal = j-TRETS, volume = "5", number = "1", pages = "3:1--3:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133355", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Current video coding standards, e.g. MPEG-4 H.264/AVC, include Variable Block Size Motion Estimation, in this paper, this process is implemented by a reconfigurable architecture based on Signed Digit arithmetic. Bit serial computation is applied to reconfigure pixel precision. The reconfigurable architectural model is extremely simple to reconfigure. Pixel truncation is used to speed up computation saving up 23.5\% of clock cycles for 4-bit precision. This design allows to process all motion vectors of a block in just one iteration. This system has been implemented in FPGA, and HDTVp results are presented. Main characteristics, of this architecture are: very reduced cost, high performance, and reconfigurable pixel precision, these features could be useful in mobile devices.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Siozios:2012:NFE, author = "Kostas Siozios and Vasilis F. 
Pavlidis and Dimitrios Soudris", title = "A novel framework for exploring {$3$-D} {FPGAs} with heterogeneous interconnect fabric", journal = j-TRETS, volume = "5", number = "1", pages = "4:1--4:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133356", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A heterogeneous interconnect architecture can be a useful approach for the design of 3-D FPGAs. A methodology to investigate heterogeneous interconnection schemes for 3-D FPGAs under different 3-D fabrication technologies is proposed. Application of the proposed methodology on benchmark circuits demonstrates an improvement in delay, power consumption, and total wire-length of approximately 41\%, 32\%, and 36\%, respectively, as compared to 2-D FPGAs. These improvements are additional to reducing the number of interlayer connections. The fewer interlayer connections are traded off for a higher yield. An area model to evaluate this trade-off is presented. Results indicate that a heterogeneous 3-D FPGA requires 37\% less area as compared to a homogeneous 3-D FPGA. Consequently, the heterogeneous FPGAs can exhibit a higher manufacturing yield. A design toolset is also developed to support the design and exploration of various performance metrics for the proposed 3-D FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Takano:2012:DAA, author = "Shigeyuki Takano", title = "Design and analysis of adaptive processor", journal = j-TRETS, volume = "5", number = "1", pages = "5:1--5:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133357", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A new computation model called CACHE (Cache Architecture for Configurable Hardware Engine) is proposed in this paper. This model does not require a dedicated host processor and its software to harness the reconfiguration. Autonomous reconfiguration is performed within a working-set of application datapaths. The CACHE model has lots of side effects; caching, resource allocation and assignment, placement and routing, and defragmentation, with a processing array itself and a special register called a working-set register file. The model aims to reduce three major workloads: (1) the processor and application design workload, (2) runtime resource management and scheduling workload, and (3) reconfiguration workload. In order to reduce these workloads, processor architecture is definitely different from traditional computing model and its microprocessor architecture. 
There are three major ideas to construct the computing system: (1) an on-chip working-set model mainly in order to control load and store of streams, namely to control traffics introducing overheads, (2) an on-chip deadlock properties model mainly in order to manage resources and to continuously configure datapaths corresponding to a working-set window, (3) a cache memory technique to work for these models, the mechanism is equivalent to the working-set window, and the cache memory's procedure is equivalent to resource request, acquirement, and release of deadlock properties. The first model focuses onto streaming applications, for example vector and matrix operations, filters, and so on, which takes coarser grained operations such as integer operations of C-language. Regarding performance compared with DSPs, that comes from constant throughput across different scale of the applications. In addition, extended model, we call Instant model that automatically generates instance of a datapath, outperforms the DSPs. This paper shows its computation model, architecture, low-level design, and analyses about basic characteristics of the execution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2012:PSF, author = "Wei Zhang and Vaughn Betz and Jonathan Rose", title = "Portable and scalable {FPGA}-based acceleration of a direct linear system solver", journal = j-TRETS, volume = "5", number = "1", pages = "6:1--6:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133352.2133358", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 20 12:12:48 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "FPGAs have the potential to serve as a platform for accelerating many computations including scientific applications. However, the large development cost and short life span for FPGA designs have limited their adoption by the scientific computing community. FPGA-based scientific computing and many kinds of embedded computing could become more practical if there were hardware libraries that were portable to any FPGA-based system with performance that scaled with the size of the FPGA. To illustrate this idea we have implemented one common super-computing library function: the LU factorization method for solving systems of linear equations. This paper describes a method for making the design both portable and scalable that should be illustrative if such libraries are to be built in the future. The design is a software-based generator that leverages both the flexibility of a software programming language and the parameters inherent in an hardware description language. The generator accepts parameters that describe the FPGA capacity and external memory capabilities. 
We compare the performance of our engine executing on the largest FPGA available at the time of this work (an Altera Stratix III 3S340) to a single processor core fabricated in the same 65nm IC process running a highly optimized software implementation from the processor vendor. For single precision matrices on the order of $ 10{,}000 \times 10{,}000 $ elements, the FPGA implementation is 2.2 times faster and the energy dissipated per useful GFLOP operation is a factor of 5 times less. For double precision, the FPGA implementation is 1.7 times faster and 3.5 times more energy efficient.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Aggarwal:2012:SFT, author = "Vikas Aggarwal and Greg Stitt and Alan George and Changil Yoon", title = "{SCF}: a Framework for Task-Level Coordination in Reconfigurable, Heterogeneous Systems", journal = j-TRETS, volume = "5", number = "2", pages = "7:1--7:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2209285.2209286", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:43 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Heterogeneous computing systems comprised of accelerators such as FPGAs, GPUs, and manycore processors coupled with standard microprocessors are becoming an increasingly popular solution for future computing systems due to their higher performance and energy efficiency. Although programming languages and tools are evolving to simplify device-level design, programming such systems is still difficult and time-consuming largely due to system-wide challenges involving communication between heterogeneous devices, which currently require ad hoc solutions.
Most communication frameworks and APIs which have dominated parallel application development for decades were developed for homogeneous systems, and hence cannot be directly employed for hybrid systems. To solve this problem, this article presents the System Coordination Framework (SCF), which employs message passing to transparently enable communication between tasks described using different programming tools (and languages), and running on heterogeneous processing devices of systems from domains ranging from embedded systems to High-Performance Computing (HPC) systems. By hiding low-level architectural details of the underlying communication from an application designer, SCF can improve application development productivity, provide higher levels of application portability, and offer rapid design-space exploration of different task/device mappings. In addition, SCF enables custom communication synthesis that exploits mechanisms specific to different devices and platforms, which can provide performance improvements over generic solutions employed previously. Our results indicate a performance improvement of 28$ \times $ and 682$ \times $ by employing FPGA devices for two applications presented in this article, while simultaneously improving the developer productivity by approximately 2.5 to 5 times by using SCF.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fekete:2012:DDR, author = "S{\'a}ndor P. Fekete and Tom Kamphans and Nils Schweer and Christopher Tessars and Jan C. 
van der Veen and Josef Angermeier and Dirk Koch and J{\"u}rgen Teich", title = "Dynamic Defragmentation of Reconfigurable Devices", journal = j-TRETS, volume = "5", number = "2", pages = "8:1--8:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2209285.2209287", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:43 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We propose a new method for defragmenting the module layout of a reconfigurable device, enabled by a novel approach for dealing with communication needs between relocated modules and with inhomogeneities found in commonly used FPGAs. Our method is based on dynamic relocation of module positions during runtime, with only very little reconfiguration overhead; the objective is to maximize the length of contiguous free space that is available for new modules. We describe a number of algorithmic aspects of good defragmentation, and present an optimization method based on tabu search. Experimental results indicate that we can improve the quality of module layout by roughly 50\% over the static layout. Among other benefits, this improvement avoids unnecessary rejections of modules.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cheng:2012:STP, author = "Lerong Cheng and Wenyao Xu and Fang Gong and Yan Lin and Ho-Yan Wong and Lei He", title = "Statistical Timing and Power Optimization of Architecture and Device for {FPGAs}", journal = j-TRETS, volume = "5", number = "2", pages = "9:1--9:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2209285.2209288", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:43 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Process variation in nanometer technology is becoming an important issue for cutting-edge FPGAs with a multimillion gate capacity. Considering both die-to-die and within-die variations in effective channel length, threshold voltage, and gate oxide thickness, we first develop closed-form models of chip-level FPGA leakage and timing variations. Experiments show that the mean and standard deviation computed by our models are within 3\% from those computed by Monte Carlo simulation. We also observe that the leakage and timing variations can be up to 3X and 1.9X, respectively. We then derive analytical yield models considering both leakage and timing variations, and use such models to evaluate the performance of FPGA device and architecture considering process variations. Compared to the baseline, which uses the VPR architecture and device setup based on the ITRS roadmap, device and architecture tuning improves leakage yield by 10.4\%, timing yield by 5.7\%, and leakage and timing combined yield by 9.4\%. We also observe that LUT size of 4 gives the highest leakage yield, LUT size of 7 gives the highest timing yield, but LUT size of 5 achieves the maximum leakage and timing combined yield. 
To the best of our knowledge, this is the first in-depth study on FPGA architecture and device coevaluation considering process variation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Martin:2012:CPA, author = "Kevin Martin and Christophe Wolinski and Krzysztof Kuchcinski and Antoine Floch and Fran{\c{c}}ois Charot", title = "Constraint Programming Approach to Reconfigurable Processor Extension Generation and Application Compilation", journal = j-TRETS, volume = "5", number = "2", pages = "10:1--10:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2209285.2209289", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:43 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article, we present a constraint programming approach for solving hard design problems present when automatically designing specialized processor extensions. Specifically, we discuss our approach for automatic selection and synthesis of processor extensions as well as efficient application compilation for these newly generated extensions. The discussed approach is implemented in our integrated design framework, IFPEC, built using Constraint Programming (CP). In our framework, custom instructions, implemented as processor extensions, are defined as computational patterns and represented as graphs. This, along with the graph representation of an application, provides a way to use our CP framework equipped with subgraph isomorphism and connected component constraints for identification of processor extensions as well as their selection, application scheduling, binding, and routing. 
All design steps assume architectures composed of runtime reconfigurable cells, implementing selected extensions, tightly connected to a processor. An advantage of our approach is the possibility of combining different heterogeneous constraints to represent and solve all our design problems. Moreover, the flexibility and expressiveness of the CP framework makes it possible to solve simultaneously extension selection, application scheduling, and binding and improve the quality of the generated results. The article is largely illustrated with experimental results.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hubner:2012:ISI, author = "Michael H{\"u}bner", title = "Introduction to the Special Issue on {ReCoSoC 2011}", journal = j-TRETS, volume = "5", number = "3", pages = "11:1--11:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362375", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shield:2012:ACC, author = "John Shield and Jean-Philippe Diguet and Guy Gogniat", title = "Asymmetric Cache Coherency: Policy Modifications to Improve Multicore Performance", journal = j-TRETS, volume = "5", number = "3", pages = "12:1--12:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362376", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Asymmetric coherency is a new optimization method for coherency policies to support nonuniform workloads in multicore processors. Asymmetric coherency assists in load balancing a workload and this is applicable to SoC multicores where the applications are not evenly spread among the processors and customization of the coherency is possible. Asymmetric coherency is a policy change, and consequently our designs require little or no additional hardware over an existing system. We explore two different types of asymmetric coherency policies. Our bus-based asymmetric coherency policy, generated a 60\% coherency cost reduction (reduction of latencies due to coherency messages) for nonshared data. Our directory-based asymmetric coherency policy, showed up to a 5.8\% execution time improvement and up to a 22\% improvement in average memory latency for the parallel benchmarks Sha, using a statically allocated asymmetry. Dynamically allocated asymmetry was found to generate further improvements in access latency, increasing the effectiveness of asymmetric coherency by up to 73.8\% when compared to the static asymmetric solution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Thielmann:2012:MLH, author = "Benjamin Thielmann and Jens Huthmann and Andreas Koch", title = "Memory Latency Hiding by Load Value Speculation for Reconfigurable Computers", journal = j-TRETS, volume = "5", number = "3", pages = "13:1--13:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362377", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Load value speculation has long been proposed as a method to hide the latency of memory accesses. It has seen very limited use in actual processors, often due to the high overhead of reexecuting misspeculated computations. We present PreCoRe, a framework capable of generating application-specific microarchitectures supporting load value speculation on reconfigurable computers. The article examines the lightweight speculation and replay mechanisms, the architecture of the actual data value prediction units as well as the impact on the nonspeculative parts of the memory system. In experiments, using PreCoRe has achieved speedups of up to 2.48 times over nonspeculative implementations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gantel:2012:ERP, author = "Laurent Gantel and Amel Khiar and Benoit Miramond and Mohamed El Amine Benkhelifa and Lounis Kessal and Fabrice Lemonnier and Jimmy Le Rhun", title = "Enhancing Reconfigurable Platforms Programmability for Synchronous Data-Flow Applications", journal = j-TRETS, volume = "5", number = "3", pages = "14:1--14:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362378", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Recent FPGAs allow the design of efficient and complex Heterogeneous Systems-on-Chip (HSoC). Namely, these systems are composed of several processors, hardware accelerators as well as communication media between all these components. Performances provided by HSoCs make them really interesting for data-flow applications, especially image processing applications. The use of this kind of architecture provides good performances but the drawback is an increase of the programming complexity. This complexity is due to the heterogeneous deployment of the application on the platform. Some functions are implemented in software to run on a processor, whereas other functions are implemented in hardware to run in a reconfigurable partition of the FPGA. This article aims to define a programming model based on the Synchronous Data-Flow model, in order to abstract the heterogeneity of the implementation and to leverage the communication issue between software and hardware actors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lusala:2012:STB, author = "Angelo Kuti Lusala and Jean-Didier Legat", title = "A {SDM--TDM}-Based Circuit-Switched Router for On-Chip Networks", journal = j-TRETS, volume = "5", number = "3", pages = "15:1--15:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362379", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article proposes a circuit-switched router that combines Spatial Division Multiplexing (SDM) and Time Division Multiplexing (TDM) in order to increase path diversity in the router while sharing channels among multiple connections. In this way, the probability of establishing paths through the network is increased, thereby significantly reducing contention in the network. Furthermore, Quality of Service (QoS) is easily guaranteed. The proposed router was synthesized on an Stratix III 3SL340F FPGA device. A 4 $ \times $ 4 2D Mesh SDM-TDM Network-on-Chip (NoC) was built with the proposed router and synthesized on the 3SL340F FPGA device. The 4 $ \times $ 4 2D Mesh SDM-TDM NoC was used to build on an FPGA device, a Multiprocessor System-on-Chip (MPSoC) platform consisted of 16 Nios II/f processors, 16 20-KB On-chip Memories, and 16 Network Interfaces. Synthesis results of the MPSoC platform show that the proposed router architecture can be used to built large practicable MPSoC platforms with the proposed NoC architecture with a reasonable hardware overhead and appreciable clock frequency. Simulation results show that combining SDM and TDM techniques in a router allows the highest probability of establishing paths through the network.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gaspar:2012:SEF, author = "Lubos Gaspar and Viktor Fischer and Lilian Bossuet and Robert Fouquet", title = "Secure Extension of {FPGA} General Purpose Processors for Symmetric Key Cryptography with Partial Reconfiguration Capabilities", journal = j-TRETS, volume = "5", number = "3", pages = "16:1--16:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362380", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In data security systems, general purpose processors (GPPs) are often extended by a cryptographic accelerator. The article presents three ways of extending GPPs for symmetric key cryptography applications. Proposed extensions guarantee secure key storage and management even if the system is facing protocol, software and cache memory attacks. The system is partitioned into processor, cipher, and key memory zones. The three security zones are separated at protocol, system, architecture and physical levels. The proposed principle was validated on Altera NIOS II, Xilinx MicroBlaze and Microsemi Cortex M1 soft-core processor extensions. We show that stringent separation of the cipher zone is helpful for partial reconfiguration of the security module, if the enciphering algorithm needs to be dynamically changed. However, the key zone including reconfiguration controller must remain static in order to maintain the high level of security required. 
We demonstrate that the principle is feasible in partially reconfigurable field programmable gate arrays (FPGAs) such as Altera Stratix V or Xilinx Virtex 6 and also to some extent in FPGAs featuring hardwired general purpose processors such as Cortex M3 in Microsemi SmartFusion FPGA. Although the three GPPs feature different data interfaces, we show that the processors with their extensions reach the required high security level while maintaining partial reconfiguration capability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ost:2012:EAT, author = "Luciano Ost and Sameer Varyani and Leandro Soares Indrusiak and Marcelo Mandelli and Gabriel Marchesan Almeida and Eduardo Wachter and Fernando Moraes and Gilles Sassatelli", title = "Enabling Adaptive Techniques in Heterogeneous {MPSoCs} Based on Virtualization", journal = j-TRETS, volume = "5", number = "3", pages = "17:1--17:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362381", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article explores the use of virtualization to enable mechanisms like task migration and dynamic mapping in heterogeneous MPSoCs, thereby targeting the design of systems capable of adapt their behavior to time-changing workloads. Because tasks may have to be mapped to target processors with different instruction set architectures, we propose the use of Low Level Virtual Machine (LLVM) to postcompile the tasks at runtime depending on their target processor. 
A novel dynamic mapping heuristic is also proposed, aiming to exploit the advantages of specialized processors while taking into account the overheads imposed by virtualization. Extensive experimental work at different levels of abstraction---FPGA prototype, RTL and system-level simulation---is presented to evaluate the proposed techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Morgan:2012:RFL, author = "Fearghal Morgan and Seamus Cawley and David Newell", title = "Remote {FPGA} Lab for Enhancing Learning of Digital Systems", journal = j-TRETS, volume = "5", number = "3", pages = "18:1--18:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2362374.2362382", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Nov 6 18:07:44 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Learning in digital systems can be enhanced through applying a learn-by-doing approach on practical hardware systems and by using Web-based technology to visualize and animate hardware behavior. The authors have reported the Web-based Remote FPGA Lab (RFL) which provides a novel, real-time control and visualization interface to a remote, always-on FPGA hardware implementation. The RFL helps students to understand and reason about digital systems operation, using interactive animation of signal behavior in an executing digital logic system, at any level of the design hierarchy. The RFL supports the creation of real-time interactive digital systems teaching demos. The article presents student RFL usage data and survey data which highlight improved student engagement, learning and achievement. 
The article describes the RFL architecture, communication interface, Web page functionality, user access administration and database management. The article also describes the RFLGen program, developed to automate user design integration into the Xilinx ISE VHDL-based RFL project wrapper for creation of FPGA configuration bitstreams and RFL animations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Krieg:2012:PMP, author = "Armin Krieg and Johannes Grinschgl and Christian Steger and Reinhold Weiss and Holger Bock and Josef Haid", title = "{POWER-MODES: POWer-EmulatoR- and MOdel-Based DEpendability and Security Evaluations}", journal = j-TRETS, volume = "5", number = "4", pages = "19:1--19:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2392616.2392617", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sun May 5 09:22:43 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Innovation cycles have been shortening significantly during the last years. This process puts tremendous pressure on designers of embedded systems for security- or reliability-critical applications. Eventual design problems not detected during design time can lead to lost money, confidentiality, or even loss of life in extreme cases. Therefore it is of vital importance to evaluate a new system for its robustness against intentionally and random induced operational faults. Currently this is generally done using extensive simulation runs using gate-level models or direct measurements on the finished silicon product. These approaches either need a significant amount of time and computational power for these simulations or rely on existing product samples.
This article presents a novel system evaluation platform using power emulation and fault injection techniques to provide an additional tool for developers of embedded systems in security- and reliability-critical fields. Faults are emulated using state-of-the-art fault injection methods and a flexible pattern representation approach. The resulting effects of these faults on the power consumption profile are estimated using state-of-the-art power emulation hardware. A modular system augmentation approach provides emulation flexibility similar to fault simulation implementations. The platform enables the efficient evaluation of new hardware or software implementations of critical security or reliability solutions at an early development phase.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Nabina:2012:AVS, author = "Atukem Nabina and Jose Luis Nunez-Yanez", title = "Adaptive Voltage Scaling in a Dynamically Reconfigurable {FPGA}-Based Platform", journal = j-TRETS, volume = "5", number = "4", pages = "20:1--20:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2392616.2392618", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sun May 5 09:22:43 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Power is an important issue limiting the applicability of Field Programmable Gate Arrays (FPGAs) since it is considered to be up to one order of magnitude higher than in ASICs. Recently, dynamic reconfiguration in FPGAs has emerged as a viable technique able to achieve power and cost reductions by time-multiplexing the required functionality at runtime. 
In this article, the applicability of Adaptive Voltage Scaling (AVS) to FPGAs is considered together with dynamic reconfiguration of logic and clock management resources to further improve the power profile of these devices. AVS is a popular power-saving technique in ASICs that enables a device to regulate its own voltage and frequency based on workload, fabrication, and operating conditions. The resulting processing platform exploits the available application-dependent timing margins to achieve a power reduction up to 85\% operating at 0.58 volts compared with operating at a nominal voltage of 1 volt. The results also show that the energy requirements at 0.58 volts are approximately five times lower compared with nominal voltage and this can be explained by the approximate cubic relation of static energy with voltage and the fact that the static component dominates power consumption in the considered FPGA devices.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jacobs:2012:RFT, author = "Adam Jacobs and Grzegorz Cieslewski and Alan D. George and Ann Gordon-Ross and Herman Lam", title = "Reconfigurable Fault Tolerance: a Comprehensive Framework for Reliable and Adaptive {FPGA}-Based Space Computing", journal = j-TRETS, volume = "5", number = "4", pages = "21:1--21:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2392616.2392619", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sun May 5 09:22:43 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Commercial SRAM-based, field-programmable gate arrays (FPGAs) have the potential to provide space applications with the necessary performance to meet next-generation mission requirements. 
However, mitigating an FPGA's susceptibility to single-event upset (SEU) radiation is challenging. Triple-modular redundancy (TMR) techniques are traditionally used to mitigate radiation effects, but TMR incurs substantial overheads such as increased area and power requirements. In order to reduce these overheads while still providing sufficient radiation mitigation, we propose a reconfigurable fault tolerance (RFT) framework that enables system designers to dynamically adjust a system's level of redundancy and fault mitigation based on the varying radiation incurred at different orbital positions. This framework includes an adaptive hardware architecture that leverages FPGA reconfigurable techniques to enable significant processing to be performed efficiently and reliably when environmental factors permit. To accurately estimate upset rates, we propose an upset rate modeling tool that captures time-varying radiation effects for arbitrary satellite orbits using a collection of existing, publicly available tools and models. We perform fault-injection testing on a prototype RFT platform to validate the RFT architecture and RFT performability models. We combine our RFT hardware architecture and the modeled upset rates using phased-mission Markov modeling to estimate performability gains achievable using our framework for two case-study orbits.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cancare:2012:EHC, author = "Fabio Cancare and Davide B. Bartolini and Matteo Carminati and Donatella Sciuto and Marco D. 
Santambrogio", title = "On the Evolution of Hardware Circuits via Reconfigurable Architectures", journal = j-TRETS, volume = "5", number = "4", pages = "22:1--22:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2392616.2392620", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sun May 5 09:22:43 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Traditionally, hardware circuits are realized according to techniques that follow the classical phases of design and testing. A completely new approach in the creation of hardware circuits has been proposed---the Evolvable Hardware (EHW) paradigm, which bases the circuit synthesis on a goal-oriented evolutionary process inspired by biological evolution in Nature. FPGA-based approaches have emerged as the main architectural solution to implement EHW systems. Various EHW systems have been proposed by researchers but most of them, being based on outdated chips, do not take advantage of the interesting features introduced in newer FPGAs. This article describes a project named Hardware Evolution over Reconfigurable Architectures (HERA), which aims at creating a complete and performance-oriented framework for the evolution of digital circuits, leveraging the reconfiguration technology available in FPGAs. The project is described from its birth to its current state, presenting its evolutionary technique tailored for FPGA-based circuits and the most recent enhancements to improve the scalability with respect to problem size. The developed EHW system outperforms the state of the art, proving its effectiveness in evolving both standard benchmarks and more complex real-world applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ould-Bachir:2013:SAS, author = "Tarek Ould-Bachir and Jean Pierre David", title = "Self-Alignment Schemes for the Implementation of Addition-Related Floating-Point Operators", journal = j-TRETS, volume = "6", number = "1", pages = "1:1--1:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457443.2457444", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:42 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Advances in semiconductor technology brings to the market incredibly dense devices, capable of handling tens to hundreds floating-point operators on a single chip; so do the latest field programmable gate arrays (FPGAs). In order to alleviate the complexity of resorting to these devices in computationally intensive applications, this article proposes hardware schemes for the realization of addition-related floating-point operators based on the self-alignment technique (SAT). The article demonstrates that the schemes guarantee an accuracy as if summation was computed accurately in the precision of operator's internal mantissa, then faithfully rounded to working precision. To achieve such performance, the article adopts the redundant high radix carry-save (HRCS) format for the rapid addition of wide mantissas. Implementation results show that combining the SAT and the HRCS format allows the implementation of complex operators with reduced area and latency, more so when a fused-path approach is adopted. 
The article also proposes a new hardware operator for performing endomorphic HRCS additions and presents a new technique for speeding up the conversion from the redundant HRCS to a conventional binary format.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2013:FBA, author = "Yan Zhang and Fan Zhang and Zheming Jin and Jason D. Bakos", title = "An {FPGA-Based} Accelerator for Frequent Itemset Mining", journal = j-TRETS, volume = "6", number = "1", pages = "2:1--2:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457443.2457445", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:42 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article we describe a Field Programmable Gate Array (FPGA)-based coprocessor architecture for Frequent Itemset Mining (FIM). FIM is a common data mining task used to find frequently occurring subsets amongst a database of sets. FIM is a nonnumerical, data intensive computation and is used in machine learning and computational biology. FIM is particularly expensive---in terms of execution time and memory---when performed on large and/or sparse databases or when applied using a low appearance frequency threshold. Because of this, the development of increasingly efficient FIM algorithms and their mapping to parallel architectures is an active field. Previous attempts to accelerate FIM using FPGAs have relied on performance-limiting strategies such as iterative database loading and runtime logic unit reconfiguration. In this article, we present a novel architecture to implement Eclat, a well-known FIM algorithm. 
Unlike previous efforts, our technique does not impose limits on the maximum set size as a function of available FPGA logic resources and our design scales well to multiple FPGAs. In addition to a novel hardware design, we also present a corresponding compression scheme for intermediate results that are stored in on-chip memory. On a four-FPGA board, experimental results show up to 68X speedup compared to a highly optimized software implementation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Meeuws:2013:QSM, author = "Roel Meeuws and S. Arash Ostadzadeh and Carlo Galuzzi and Vlad Mihai Sima and Razvan Nane and Koen Bertels", title = "{Quipu}: a Statistical Model for Predicting Hardware Resources", journal = j-TRETS, volume = "6", number = "1", pages = "3:1--3:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457443.2457446", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:42 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "There has been a steady increase in the utilization of heterogeneous architectures to tackle the growing need for computing performance and low-power systems. The execution of computation-intensive functions on specialized hardware enables to achieve substantial speedups and power savings. However, with a large legacy code base and software engineering experts, it is not at all obvious how to easily utilize these new architectures. As a result, there is a need for comprehensive tool support to bridge the knowledge gap of many engineers as well as to retarget legacy code. 
In this article, we present the Quipu modeling approach, which consists of a set of tools and a modeling methodology that can generate hardware estimation models, which provide valuable information for developers. This information helps to focus their efforts, to partition their application, and to select the right heterogeneous components. We present Quipu's capability to generate domain-specific models, that are up to several times more accurate within their particular domain (error: 4.6\%) as compared to domain-agnostic models (error: 23\%). Finally, we show how Quipu can generate models for a new toolchain and platform within a few days.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{deDinechin:2013:FPE, author = "Florent de Dinechin and Pedro Echeverr{\'\i}a and Marisa L{\'o}pez-Vallejo and Bogdan Pasca", title = "Floating-Point Exponentiation Units for Reconfigurable Computing", journal = j-TRETS, volume = "6", number = "1", pages = "4:1--4:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457443.2457447", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:42 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The high performance and capacity of current FPGAs makes them suitable as acceleration co-processors. This article studies the implementation, for such accelerators, of the floating-point power function $ x^y $ as defined by the C99 and IEEE 754-2008 standards, generalized here to arbitrary exponent and mantissa sizes. 
Last-bit accuracy at the smallest possible cost is obtained thanks to a careful study of the various subcomponents: a floating-point logarithm, a modified floating-point exponential, and a truncated floating-point multiplier. A parameterized architecture generator in the open-source FloPoCo project is presented in details and evaluated.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Neely:2013:RTH, author = "Christopher E. Neely and Gordon Brebner and Weijia Shang", title = "{ReShape}: Towards a High-Level Approach to Design and Operation of Modular Reconfigurable Systems", journal = j-TRETS, volume = "6", number = "1", pages = "5:1--5:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457443.2457448", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:42 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The latest FPGA devices provide the headroom to implement large-scale and complex systems. A key requirement is the integration of modules from diverse sources to promote modular design and reuse. A contrary factor is that using dynamic partial reconfiguration typically requires low-level planning of the system implementation. In this article, we introduce ReShape: a high-level approach for designing reconfigurable systems by interconnecting modules, which gives a ``plug and play'' look and feel, is supported by tools that carry out implementation functions, and is carried through to support system reconfiguration during operation. 
The emphasis is on the inter-module connections and abstracting the communication patterns that are typical between modules: for example, the streaming of data, or the reading and writing of data to and from memory modules. The details of wiring and signaling are hidden from view, via metadata associated with individual modules. This setting allows system reconfiguration at the module level, both by supporting type checking of replacement modules and by managing the overall system implementation, via metadata associated with its FPGA floorplan. The methodology and tools have been implemented in a prototype targeted to a domain-specific setting---high-speed networking---and have been validated on real telecommunications design projects.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Goehringer:2013:ISS, author = "Diana Goehringer and Ren{\'e} Cumplido", title = "Introduction to the special section on {19th Reconfigurable Architectures Workshop (RAW 2012)}", journal = j-TRETS, volume = "6", number = "2", pages = "6:1--6:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499625.2499626", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sidiropoulos:2013:JFS, author = "Harry Sidiropoulos and Kostas Siozios and Peter Figuli and Dimitrios Soudris and Michael H{\"u}bner and J{\"u}rgen Becker", title = "{JITPR}: a framework for supporting fast application's implementation onto {FPGAs}", journal = j-TRETS, volume = "6", number = "2", pages = "7:1--7:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492185", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The execution runtime usually is a headache for designers performing application mapping onto reconfigurable architectures. In this article we propose a methodology, as well as the supporting toolset, targeting to provide fast application implementation onto reconfigurable architectures with the usage of a Just-In-Time (JIT) compilation framework. Experimental results prove the efficiency of the introduced framework, as we reduce the execution runtime compared to the state-of-the-art approach on average by 53.5$ \times $. Additionally, the derived solutions achieve higher operation frequencies by 1.17$ \times $, while they also exhibit significant lower fragmentation ratios of hardware resources.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Heisswolf:2013:VND, author = "Jan Heisswolf and Aurang Zaib and Andreas Weichslgartner and Ralf K{\"o}nig and Thomas Wild and J{\"u}rgen Teich and Andreas Herkersdorf and J{\"u}rgen Becker", title = "Virtual networks --- distributed communication resource management", journal = j-TRETS, volume = "6", number = "2", pages = "8:1--8:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492186", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Networks-on-Chip (NoC) enable scalability for future manycore architectures, facilitating parallel communication between multiple cores. Applications running in parallel on a NoC-based architecture can affect each other due to overlapping communication. Quality-of-Service (QoS) must be supported by the communication infrastructure to execute communication-, real-time- and safety-critical applications on such an architecture. Different strategies have been proposed to provide QoS for point-to-point connections. These strategies allow each node to set up a limited number of connections to other nodes. In this work Virtual Networks (VN) are proposed to enable QoS for regions of a NoC-based architecture. Virtual Networks overcome the limitation of point-to-point connections. A VN behaves like an exclusive physical network. Virtual Networks can be defined and configured during runtime. The size of the VN region and the assigned bandwidth can be adjusted depending on the application requirements. Virtual Networks enable the decoupling of local from global communication. 
Therefore, the communication of the application mapped into the region is assigned to a Virtual Network established in that specific region. This concept targets packet-switched networks with virtual channels and is realized by an intelligent hardware unit that manages the virtual channel reservation process at system runtime. Virtual Networks can be established and administrated independent of each other, enabling distributed communication resource management. The proposed concept is implemented as a cycle-accurate SystemC simulation model. The simulation results of executing communicating graphs obtained from real application highlight the usefulness of Virtual Networks by showing improved throughput and reduced delay in the respective scenarios. A hardware implementation demonstrates a low impact on area utilization and power consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ganegedara:2013:CPA, author = "Thilan Ganegedara and Viktor Prasanna", title = "A comprehensive performance analysis of virtual routers on {FPGA}", journal = j-TRETS, volume = "6", number = "2", pages = "9:1--9:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492187", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Network virtualization has gained much popularity with the advent of datacenter networking. The hardware aspect of network virtualization, router virtualization, allows network service providers to consolidate network hardware, reducing equipment cost and management overhead. 
Several approaches have been proposed to achieve router virtualization to support several virtual networks on a single hardware platform. However, their performance has not been analyzed quantitatively to understand the benefits of each approach. In this work, we perform a comprehensive analysis of performance of these approaches on Field Programmable Gate Array (FPGA) with respect to memory consumption, throughput, and power consumption. Generalized versions of virtualization approaches are evaluated based on post place-and-route results on a state-of-the-art FPGA. Grouping of routing tables is proposed as a novel approach to improve scalability (i.e., the number of virtual networks hosted on a single chip) of virtual routers on FPGA with respect to memory requirement. Further, we employ floor-planning techniques to efficiently utilize chip resources and achieve high performance for virtualized, pipelined architectures, resulting in 1.6$ \times $ speedup on the average compared with the non-floor-planned approach. The results indicate that the proposed solution is able to support 100+ and 50 virtual routers per chip in the near-best and near-worst case scenarios, while operating at 20+ Gbps rates.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Das:2013:TDA, author = "Joydip Das and Steven J. E. 
Wilton", title = "Towards development of an analytical model relating {FPGA} architecture parameters to routability", journal = j-TRETS, volume = "6", number = "2", pages = "10:1--10:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499625.2499627", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We present an analytical model relating FPGA architectural parameters to the routability of the FPGA. The inputs to the model include the channel width and the connection and the switch block flexibilities. The output is an estimate of the proportion of nets in a large circuit that can be expected to be successfully routed on the FPGA. We assume that the circuit is routed to the FPGA using a single-step combined global/detailed router. We show that the model correctly predicts routability trends. We also present an example application to demonstrate that this model may be a valuable tool for FPGA architects. When combined with the earlier works on analytical modeling, our model can be used to quickly predict the routability without going through any stage of an expensive CAD flow. We envisage that this model will benefit FPGA architecture designers and vendors to quickly evaluate FPGA routing fabrics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Huang:2013:VHS, author = "Chun-Hsian Huang and Pao-Ann Hsiung", title = "Virtualizable hardware\slash software design infrastructure for dynamically partially reconfigurable systems", journal = j-TRETS, volume = "6", number = "2", pages = "11:1--11:??", month = jul, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499625.2499628", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "In most existing works, reconfigurable hardware modules are still managed as conventional hardware devices. Further, the software reconfiguration overhead incurred by loading corresponding device drivers into the kernel of an operating system has been overlooked until now. As a result, the enhancement of system performance and the utilization of reconfigurable hardware modules are still quite limited. This work proposes a virtualizable hardware/software design infrastructure (VDI) for dynamically partially reconfigurable systems. Besides the gate-level hardware virtualization provided by the partial reconfiguration technology, VDI supports the device-level hardware virtualization. In VDI, a reconfigurable hardware module can be virtualized such that it can be accessed efficiently by multiple applications in an interleaving way. A Hot-Plugin Connector (HPC) replaces the conventional device driver, such that it not only assists the device-level hardware virtualization but can also be reused across different hardware modules. To facilitate hardware/software communication and to enhance system scalability, the proposed VDI is realized as a hierarchical design framework. 
User-designed reconfigurable hardware modules can be easily integrated into VDI, and are then executed as hardware tasks in an operating system for reconfigurable systems (OS4RS). A dynamically partially reconfigurable network security system was designed using VDI, which demonstrated a higher utilization of reconfigurable hardware modules and a reduction by up to 12.83\% of the processing time required by using the conventional method in a dynamically partially reconfigurable system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2013:INL, author = "Hanyu Liu and Senthilkumar T. Rajavel and Ali Akoglu", title = "Integration of Net-Length Factor with Timing- and Routability-Driven Clustering Algorithms", journal = j-TRETS, volume = "6", number = "3", pages = "12:1--12:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2517324", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:45 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In FPGA CAD flow, the clustering stage builds the foundation for placement and routing stages and affects performance parameters, such as routability, delay, and channel width significantly. Net sharing and criticality are the two most commonly used factors in clustering cost functions. With this study, we first derive a third term, net-length factor, and then design a generic method for integrating net length into the clustering algorithms. Net-length factor enables characterizing the nets based on the routing stress they might cause during later stages of the CAD flow and is essential for enhancing the routability of the design. 
We evaluate the effectiveness of integrating net length as a factor into the well-known timing (T-VPack)-, depopulation (T-NDPack)-, and routability (iRAC and T-RPack)-driven clustering algorithms. Through exhaustive experimental studies, we show that net-length factor consistently helps improve the channel-width performance of routability-, depopulation-, and timing-driven clustering algorithms that do not explicitly target low fan-out nets in their cost functions. Particularly, net-length factor leads to average reduction in channel width for T-VPack, T-RPack, and T-NDPack by 11.6\%, 10.8\%, and 14.2\%, respectively, and in a majority of the cases, improves the critical-path delay without increasing the array size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mehta:2013:UGE, author = "Gayatri Mehta and Carson Crawford and Xiaozhong Luo and Natalie Parde and Krunalkumar Patel and Brandon Rodgers and Anil Kumar Sistla and Anil Yadav and Marc Reisner", title = "{UNTANGLED}: a Game Environment for Discovery of Creative Mapping Strategies", journal = j-TRETS, volume = "6", number = "3", pages = "13:1--13:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2517325", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:45 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The problem of creating efficient mappings of dataflow graphs onto specific architectures (i.e., solving the place and route problem) is incredibly challenging. The difficulty is especially acute in the area of Coarse-Grained Reconfigurable Architectures (CGRAs) to the extent that solving the mapping problem may remove a significant bottleneck to adoption. 
We believe that the next generation of mapping algorithms will exhibit pattern recognition, the ability to learn from experience, and identification of creative solutions, all of which are human characteristics. This manuscript describes our game UNTANGLED, developed and fine-tuned over the course of a year to allow us to capture and analyze human mapping strategies. It also describes our results to date. We find that the mapping problem can be crowdsourced very effectively, that players can outperform existing algorithms, and that successful player strategies share many elements in common. Based on our observations and analysis, we make concrete recommendations for future research directions for mapping onto CGRAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hormigo:2013:SRC, author = "Javier Hormigo and Gabriel Caffarena and Juan P. Oliver and Eduardo Boemo", title = "Self-Reconfigurable Constant Multiplier for {FPGA}", journal = j-TRETS, volume = "6", number = "3", pages = "14:1--14:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490830", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:45 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Constant multipliers are widely used in signal processing applications to implement the multiplication of signals by a constant coefficient. However, in some applications, this coefficient remains invariable only during an interval of time, and then, its value changes to adapt to new circumstances. In this article, we present a self-reconfigurable constant multiplier suitable for LUT-based FPGAs able to reload the constant in runtime. 
The pipelined architecture presented is easily scalable to any multiplicand and constant sizes, for unsigned and signed representations. It can be reprogrammed in 16 clock cycles, equivalent to less than 100 ns in current FPGAs. This value is significantly smaller than FPGA partial configuration times. The presented approach is more efficient in terms of area and speed when compared to generic multipliers, achieving up to 91\% area reduction and up to 102\% speed improvement for the case-study circuits tested. The power consumption of the proposed multipliers are in the range of those of slice-based multipliers provided by the vendor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gharibian:2013:ASL, author = "Farnaz Gharibian and Lesley Shannon and Peter Jamieson and Kevin Chung", title = "Analyzing System-Level Information's Correlation to {FPGA} Placement", journal = j-TRETS, volume = "6", number = "3", pages = "15:1--15:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501985", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:45 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "One popular placement algorithms for Field-Programmable Gate Arrays (FPGAs) is called Simulated Annealing (SA). This algorithm tries to create a good quality placement from a flattened design that no longer contains any high-level information related to the original design hierarchy. Placement is an NP-hard problem, and as the size and complexity of designs implemented on FPGAs increases, SA does not scale well to find good solutions in a timely fashion. 
In this article, we investigate if system-level information can be reconstructed from a flattened netlist and evaluate how that information is realized in terms of its locality in the final placement. If there is a strong relationship between good quality placements and system-level information, then it may be possible to divide a large design into smaller components and improve the time needed to create a good quality placement. Our preliminary results suggest that the locality property of the information embedded in the system-level HDL structure (i.e. ``module'', ``always'', and ``if'' statements) is greatly affected by designer HDL coding style. Therefore, a reconstructive algorithm, called Affinity Propagation, is also considered as a possible method of generating a meaningful coarse-grain picture of the design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Plavec:2013:ETD, author = "Franjo Plavec and Zvonko Vranesic and Stephen Brown", title = "Exploiting Task- and Data-Level Parallelism in Streaming Applications Implemented in {FPGAs}", journal = j-TRETS, volume = "6", number = "4", pages = "16:1--16:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2535932", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:46 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article describes the design and implementation of a novel compilation flow that implements circuits in FPGAs from a streaming programming language. The streaming language supported is called FPGA Brook and is based on the existing Brook language. 
It allows system designers to express applications in a way that exposes parallelism, which can be exploited through hardware implementation. FPGA Brook supports replication, allowing parts of an application to be implemented as multiple hardware units operating in parallel. Hardware units are interconnected through FIFO buffers which use the small memory modules available in FPGAs. The FPGA Brook automated design flow uses a source-to-source compiler, developed as a part of this work, and combines it with a commercial behavioral synthesis tool to generate the hardware implementation. A suite of benchmark applications was developed in FPGA Brook and implemented using our design flow. Experimental results indicate that performance of many applications scales well with replication. Our benchmark applications also achieve significantly better results than corresponding implementations using a commercial behavioral synthesis tool. We conclude that using an automated design flow for implementation of streaming applications in FPGAs is a promising methodology.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ananthan:2013:RPH, author = "T. Ananthan and M. V. Vaidyan", title = "A Reconfigurable Parallel Hardware Implementation of the Self-Tuning Regulator", journal = j-TRETS, volume = "6", number = "4", pages = "17:1--17:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2535934", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:46 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The self-tuning regulator (STR) is a popular adaptive control algorithm. 
A high-performance computer is required for its implementation due to the heavy online computational burden. To extend STR for more real-time applications, a parallel hardware implementation on a low-cost reconfigurable computer is presented. The hardware was incorporated with multistage matrix multiplication (MMM) and trace technique to enhance the processing speed. This design was deeply pipelined to achieve high throughput. The algorithm was prototyped on a Xilinx field-programmable gate array (FPGA) device with a maximum operating frequency of 210.436 MHz. Application-specific integrated circuit (ASIC) implementation of STR was reported.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Leow:2013:AME, author = "Yoon Kah Leow and Ali Akoglu and Susan Lysecky", title = "An Analytical Model for Evaluating Static Power of Homogeneous {FPGA} Architectures", journal = j-TRETS, volume = "6", number = "4", pages = "18:1--18:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2535935", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:46 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "As capacity of the field-programmable gate arrays (FPGAs) continues to increase, power dissipated in the logic and routing resources has become a critical concern for FPGA architects. Recent studies have shown that static power is fast approaching the dynamic power in submicron devices. In this article, we propose an analytical model for relating homogeneous island-style-based FPGA architecture to static power. Current FPGA power models are tightly coupled with CAD tools. 
Our CAD-independent model captures the static power for a given FPGA architecture based on estimates of routing and logic resource utilizations from a pre-technology mapped netlist. We observe an average correlation ratio (C-Ratio) of 95\% and a minimum absolute percentage error (MAPE) rate of 15\% with respect to the experimental results generated by the Versatile Placement Routing (VPR) tool over the MCNC benchmarks. Our model offers application engineers and FPGA architects the capability to evaluate the impact of their design choices on static power without having to go through CAD-intensive investigations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ben-Asher:2013:OWS, author = "Yosi Ben-Asher and Ron Meldiner and Nadav Rotem", title = "Optimizing Wait States in the Synthesis of Memory References with Unpredictable Latencies", journal = j-TRETS, volume = "6", number = "4", pages = "19:1--19:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2535936", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:46 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We consider the problem of synthesizing circuits (from C to Verilog) that are optimized to handle unpredictable latencies of memory operations. Unpredictable memory latencies can occur due to the use of on chip caches, DRAM memory modules, buffers/queues, or multiport memories. Typically, high-level synthesis compilers assume fixed and known memory latencies, and thus are able to schedule the code's operations efficiently. The operations in the source code are scheduled into states of a state machine whose states will be synthesized to Verilog. 
The goal is to minimize scheduling length by maximizing the number of operations (and in particular memory operations) that are executed in parallel at the same state. However, with unpredictable latencies, there can be an exponential number of possible orders in which these parallel memory operations can terminate. Thus, in order to minimize the scheduling, we need a different schedule for any such order. This is not practical, and we show a technique of synthesizing a compact state machine that schedules only a small subset of these possible termination orders. Our results show that this compact state machine can improve the execution time compared to a regular scheduling that waits for the termination of all the active memory references in every state.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kornaros:2014:DPT, author = "George Kornaros and Dionisios Pnevmatikatos", title = "Dynamic Power and Thermal Management of {NoC-Based} Heterogeneous {MPSoCs}", journal = j-TRETS, volume = "7", number = "1", pages = "1:1--1:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2567658", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:47 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Advances in silicon process technology have made it possible to include multiple processor cores on a single die. Billion transistor architectures usually in the form of networks-on-chip present a wide range of challenges in design, microarchitecture, and algorithmic levels with significant impact to system performance and power consumption. 
In this article, we propose efficient methods and mechanisms that exploit a heterogeneous network-on-chip (NoC) to achieve a power- and thermal-aware coherent system. To this end, we utilize different management techniques which employ dynamic frequency scaling circuitry and power and temperature sensors per node to achieve real-time workload prediction and allocation at node and system level by low-cost threads. The developed heterogeneous multicoprocessing infrastructure is utilized to evaluate diverse policies for power-aware computing in terms of effectiveness and in relation to distributed sensor-conscious management. The proposed reconfigurable architecture supports coprocessor accelerators per node, monitors the program's power profile on-the-fly, and balances power and thermal behavior at the NoC level. Overall, these techniques form a system exploration methodology using a multi-FPGA emulation platform showing a minimum complexity overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Iskander:2014:HLA, author = "Yousef Iskander and Cameron Patterson and Stephen Craven", title = "High-Level Abstractions and Modular Debugging for {FPGA} Design Validation", journal = j-TRETS, volume = "7", number = "1", pages = "2:1--2:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2567662", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:47 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Design validation is the most time-consuming task in the FPGA design cycle. 
Although manufacturers and third-party vendors offer a range of tools that provide visibility and control of the different stages of a design, many require that the design be fully re-implemented for even simple parameter modifications or do not allow the design to be run at full speed. Designs are typically first modeled using a high-level language then later rewritten in a hardware description language, first for simulation and then later modified for synthesis. IP and third-party cores may differ during these final two stages complicating development and validation. The developed approach provides two means of directly validating synthesized hardware designs. The first allows the original high-level model written in C or C++ to be directly coupled to the synthesized hardware, abstracting away the traditional gate-level view of designs. A high-level programmatic interface allows the synthesized design to be validated directly by the software reference model. The second approach provides an alternative view to FPGAs within the scope of a traditional software debugger. This debug framework leverages partially reconfigurable regions to accelerate the modification of dynamic, software-like breakpoints for low-level analysis and provides a automatable, scriptable, command-line interface directly to a running design on an FPGA.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jin:2014:FAS, author = "Minxi Jin and Tsutomu Maruyama", title = "Fast and Accurate Stereo Vision System on {FPGA}", journal = j-TRETS, volume = "7", number = "1", pages = "3:1--3:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2567659", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:47 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article, we present a fast and high quality stereo matching algorithm on FPGA using cost aggregation (CA) and fast locally consistent (FLC) dense stereo. In many software programs, global matching algorithms are used in order to obtain accurate disparity maps. Although their error rates are considerably low, their processing speeds are far from that required for real-time processing because of their complex processing sequences. In order to realize real-time processing, many hardware systems have been proposed to date. They have achieved considerably high processing speeds; however, their error rates are not as good as those of software programs, because simple local matching algorithms have been widely used in those systems. In our system, sophisticated local matching algorithms (CA and FLC) that are suitable for FPGA implementation are used to achieve low error rate while maintaining the high processing speed. We evaluate the performance of our circuit on Xilinx Vertex-6 FPGAs. Its error rate is comparable to that of top-level software algorithms, and its processing speed is nearly 2 clock cycles per pixel, which reaches 507.9 fps for 640 $ \times $ 480 pixel images.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ulusel:2014:FDE, author = "Onur Ulusel and Kumud Nepal and R. Iris Bahar and Sherief Reda", title = "Fast Design Exploration for Performance, Power and Accuracy Tradeoffs in {FPGA-Based} Accelerators", journal = j-TRETS, volume = "7", number = "1", pages = "4:1--4:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2567661", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:47 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The ease-of-use and reconfigurability of FPGAs makes them an attractive platform for accelerating algorithms. However, accelerating becomes a challenging task as the large number of possible design parameters lead to different accelerator variants. In this article, we propose techniques for fast design exploration and multi-objective optimization to quickly identify both algorithmic and hardware parameters that optimize these accelerators. This information is used to run regression analysis and train mathematical models within a nonlinear optimization framework to identify the optimal algorithm and design parameters under various objectives and constraints. To automate and improve the model generation process, we propose the use of L$_1$-regularized least squares regression techniques. We implement two real-time image processing accelerators as test cases: one for image deblurring and one for block matching. For these designs, we demonstrate that by sampling only a small fraction of the design space (0.42\% and 1.1\%), our modeling techniques are accurate within 2\%--4\% for area and throughput, 8\%--9\% for power, and 5\%--6\% for arithmetic accuracy.
We show speedups of 340$ \times $ and 90$ \times $ in time for the test cases compared to brute-force enumeration. We also identify the optimal set of parameters for a number of scenarios (e.g., minimizing power under arithmetic inaccuracy bounds).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kim:2014:FPF, author = "Lok-Won Kim and Sameh Asaad and Ralph Linsker", title = "A Fully Pipelined {FPGA} Architecture of a Factored Restricted {Boltzmann} Machine Artificial Neural Network", journal = j-TRETS, volume = "7", number = "1", pages = "5:1--5:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2539125", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Mar 13 08:09:47 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Artificial neural networks (ANNs) are a natural target for hardware acceleration by FPGAs and GPGPUs because commercial-scale applications can require days to weeks to train using CPUs, and the algorithms are highly parallelizable. Previous work on FPGAs has shown how hardware parallelism can be used to accelerate a ``Restricted Boltzmann Machine'' (RBM) ANN algorithm, and how to distribute computation across multiple FPGAs. Here we describe a fully pipelined parallel architecture that exploits ``mini-batch'' training (combining many input cases to compute each set of weight updates) to further accelerate ANN training. We implement on an FPGA, for the first time to our knowledge, a more powerful variant of the basic RBM, the ``Factored RBM'' (fRBM). The fRBM has proved valuable in learning transformations and in discovering features that are present across multiple types of input. We obtain (in simulation) a 100-fold acceleration (vs. 
CPU software) for an fRBM having N = 256 units in each of its four groups (two input, one output, one intermediate group of units) running on a Virtex-6 LX760 FPGA. Many of the architectural features we implement are applicable not only to fRBMs, but to basic RBMs and other ANN algorithms more broadly.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Luu:2014:VNG, author = "Jason Luu and Jeffrey Goeders and Michael Wainberg and Andrew Somerville and Thien Yu and Konstantin Nasartschuk and Miad Nasr and Sen Wang and Tim Liu and Nooruddin Ahmed and Kenneth B. Kent and Jason Anderson and Jonathan Rose and Vaughn Betz", title = "{VTR 7.0}: Next Generation Architecture and {CAD} System for {FPGAs}", journal = j-TRETS, volume = "7", number = "2", pages = "6:1--6:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617593", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Exploring architectures for large, modern FPGAs requires sophisticated software that can model and target hypothetical devices. Furthermore, research into new CAD algorithms often requires a complete and open source baseline CAD flow. This article describes recent advances in the open source Verilog-to-Routing (VTR) CAD flow that enable further research in these areas. VTR now supports designs with multiple clocks in both timing analysis and optimization. Hard adder/carry logic can be included in an architecture in various ways and significantly improves the performance of arithmetic circuits. The flow now models energy consumption, an increasingly important concern. 
The speed and quality of the packing algorithms have been significantly improved. VTR can now generate a netlist of the final post-routed circuit which enables detailed simulation of a design for a variety of purposes. We also release new FPGA architecture files and models that are much closer to modern commercial architectures, enabling more realistic experiments. Finally, we show that while this version of VTR supports new and complex features, it has a 1.5$ \times $ compile time speed-up for simple architectures and a 6$ \times $ speed-up for complex architectures compared to the previous release, with no degradation to timing or wire-length quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{J:2014:MAN, author = "Soumya J. and Ashish Sharma and Santanu Chattopadhyay", title = "Multi-Application Network-on-Chip Design using Global Mapping and Local Reconfiguration", journal = j-TRETS, volume = "7", number = "2", pages = "7:1--7:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2556944", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article proposes a reconfigurable Network-on-Chip (NoC) architecture based on mesh topology. It provides a local reconfiguration of cores to connect to any of the neighboring routers, depending upon the currently executing application. The area overhead for this local reconfiguration has been shown to be very small. We have also presented the strategy to map the cores of an application set onto this architecture. This has been achieved via a two-phase procedure. 
In the first phase, the cores of the combined application set are mapped tentatively to individual routers, minimizing the communication cost. In the second phase, for each application, positions of individual cores are finalized. A core gets attached to any neighbor of its tentative allocation. We have proposed Integer Linear Programming (ILP) formulation of both the phases. Since ILP takes large amount of CPU time, we have also formulated a Particle Swarm Optimization (PSO)-based solution for the two phases. A heuristic approach has also been developed for the reconfiguration. Comparison of communication cost, latency and network energy have been carried out for the applications, before and after reconfiguration. It shows significant improvement in performance via reconfiguration.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lei:2014:FIS, author = "Yuanwu Lei and Lei Guo and Yong Dou and Sheng Ma and Jinbo Xu", title = "{FPGA} Implementation of a Special-Purpose {VLIW} Structure for Double-Precision Elementary Function", journal = j-TRETS, volume = "7", number = "2", pages = "8:1--8:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617594", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In the current article, the capability and flexibility of field programmable gate-arrays (FPGAs) to implement IEEE-754 double-precision floating-point elementary functions are explored. 
To perform various elementary functions on the unified hardware efficiently, we propose a special-purpose very long instruction word (VLIW) processor, called DP\_VELP. This processor is equipped with multiple basic units, and its performance is improved through an explicitly parallel technique. Pipelined evaluation of polynomial approximation with Estrin's scheme is proposed, by scheduling basic components in an optimal order to avoid data hazard stalls and achieve minimal latency. The custom VLIW processor can achieve high scalability. Under the control of specific VLIW instructions, the basic units are combined into special-purpose hardware for elementary functions. Common elementary functions are presented as examples to illustrate the design of elementary function in DP\_VELP in detail. Minimax approximation scheme is used to reduce degree of polynomial. Compromise between the size of lookup table and the latency is discussed, and the internal precision is carefully planned to guarantee accuracy of the result. Finally, we create a prototype of the DP\_VELP unit and an FPGA accelerator based on the DP\_VELP unit on a Xilinx XC6VLX760 FPGA chip to implement the SGP4/SDP4 application. Compared with previous researches, the proposed design can achieve low latency with a reasonable amount of resources and evaluate a variety of elementary functions with the unified hardware to satisfy the demands in scientific applications. Experimental results show that the proposed design guarantees more than 99\% of correct rounding. Moreover, the SGP4/SDP4 accelerator, which is equipped with 39 DP\_VELP units and runs at 200 MHz, outperforms the parallel software approach with hyper-thread technology on an Intel Xeon Quad E5620 CPU at 2.40 GHz by a factor of 7X.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Clemente:2014:MSA, author = "Juan Antonio Clemente and Ivan Beretta and Vincenzo Rana and David Atienza and Donatella Sciuto", title = "A Mapping-Scheduling Algorithm for Hardware Acceleration on Reconfigurable Platforms", journal = j-TRETS, volume = "7", number = "2", pages = "9:1--9:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2611562", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Reconfigurable platforms are a promising technology that offers an interesting trade-off between flexibility and performance, which many recent embedded system applications demand, especially in fields such as multimedia processing. These applications typically involve multiple ad-hoc tasks for hardware acceleration, which are usually represented using formalisms such as Data Flow Diagrams (DFDs), Data Flow Graphs (DFGs), Control and Data Flow Graphs (CDFGs) or Petri Nets. However, none of these models is able to capture at the same time the pipeline behavior between tasks (that therefore can coexist in order to minimize the application execution time), their communication patterns, and their data dependencies. This article proves that the knowledge of all this information can be effectively exploited to reduce the resource requirements and the timing performance of modern reconfigurable systems, where a set of hardware accelerators is used to support the computation. For this purpose, this article proposes a novel task representation model, named Temporal Constrained Data Flow Diagram (TCDFD), which includes all this information. 
This article also presents a mapping-scheduling algorithm that is able to take advantage of the new TCDFD model. It aims at minimizing the dynamic reconfiguration overhead while meeting the communication requirements among the tasks. Experimental results show that the presented approach achieves up to 75\% of resources saving and up to 89\% of reconfiguration overhead reduction with respect to other state-of-the-art techniques for reconfigurable platforms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hoang:2014:IMD, author = "Anh-Tuan Hoang and Takeshi Fujino", title = "Intra-Masking Dual-Rail Memory on {LUT} Implementation for {SCA}-Resistant {AES} on {FPGA}", journal = j-TRETS, volume = "7", number = "2", pages = "10:1--10:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617595", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In current countermeasure design trends against differential power analysis (DPA), security at gate level is required in addition to the security algorithm. Several dual-rail pre-charge logics (DPL) have been proposed to achieve this goal. Designs using ASIC can attain this goal owing to its backend design restrictions on placement and routing. However, implementing these designs on field programmable gate arrays (FPGA) without information leakage is still a problem because of the difficulty involved in the restrictions on placement and routing on FPGA. 
This article describes our novel masked dual-rail pre-charged memory approach, called `intra-masking dual-rail memory (IMDRM) on LUT', and its implementation on FPGA for Side-Channel Attack-resistant (SCA-resistant) AES. In the proposed design, all unsafe nodes, such as unmasking and masking, and parts of dual-rail memory with unsafe buses (buses that are not masked) are packed into a single LUT. This makes them balanced and independent of the placement and routing tools. Inputs and outputs of all LUTs are masked, and so can be considered safe signals. Several LUTs can be combined to create a safe SBox. The design is independent of the cryptographic algorithm, and hence, it can be applied to available cryptographic standards such as DES or AES as well as future standards. It requires no special placement or route constraints in its implementation. A correlation power analysis (CPA) attack on 1,000,000 traces of AES implementation on FPGA showed that the secret information is well protected against first-order side-channel attacks. Even though the number of LUTs used for memory in this implementation is seven times greater than that of the conventional unprotected single-rail memory table-lookup AES and three times greater than the implementation based on a composite field, it requires a smaller number of LUTs than all other advanced SCA-resistant implementations such as the wave dynamic differential logic, masked dual-rail pre-charge logic, and threshold.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Becker:2014:ITS, author = "Tobias Becker", title = "Introduction to the {TRETS} Special Section on the {Workshop on Self-Awareness in Reconfigurable Computing Systems (SRCS'12)}", journal = j-TRETS, volume = "7", number = "2", pages = "11:1--11:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2611564", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Panerati:2014:CIL, author = "Jacopo Panerati and Martina Maggio and Matteo Carminati and Filippo Sironi and Marco Triverio and Marco D. Santambrogio", title = "Coordination of Independent Loops in Self-Adaptive Systems", journal = j-TRETS, volume = "7", number = "2", pages = "12:1--12:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2611563", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Nowadays, the same piece of code should run on different architectures, providing performance guarantees in a variety of environments and situations. To this end, designers often integrate existing systems with ad-hoc adaptive strategies able to tune specific parameters that impact performance or energy---for example, frequency scaling.
However, these strategies interfere with one another and unpredictable performance degradation may occur due to the interaction between different entities. In this article, we propose a software approach to reconfiguration when different strategies, called loops, are encapsulated in the system and are available to be activated. Our solution to loop coordination is based on machine learning and it selects a policy for the activation of loops inside of a system without prior knowledge. We implemented our solution on top of GNU/Linux and evaluated it with a significant subset of the PARSEC benchmark suite.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Agne:2014:SAM, author = "Andreas Agne and Markus Happe and Achim L{\"o}sch and Christian Plessl and Marco Platzner", title = "Self-Awareness as a Model for Designing and Operating Heterogeneous Multicores", journal = j-TRETS, volume = "7", number = "2", pages = "13:1--13:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617596", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Self-aware computing is a paradigm for structuring and simplifying the design and operation of computing systems that face unprecedented levels of system dynamics and thus require novel forms of adaptivity. The generality of the paradigm makes it applicable to many types of computing systems and, previously, researchers started to introduce concepts of self-awareness to multicore architectures. 
In our work we build on a recent reference architectural framework as a model for self-aware computing and instantiate it for an FPGA-based heterogeneous multicore running the ReconOS reconfigurable architecture and operating system. After presenting the model for self-aware computing and ReconOS, we demonstrate with a case study how a multicore application built on the principle of self-awareness, autonomously adapts to changes in the workload and system state. Our work shows that the reference architectural framework as a model for self-aware computing can be practically applied and allows us to structure and simplify the design process, which is essential for designing complex future computing systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Beckhoff:2014:DTI, author = "Christian Beckhoff and Dirk Koch and Jim Torresen", title = "Design Tools for Implementing Self-Aware and Fault-Tolerant Systems on {FPGAs}", journal = j-TRETS, volume = "7", number = "2", pages = "14:1--14:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617597", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "To fully exploit the capabilities of runtime reconfigurable FPGAs in self-aware systems, design tools are required that exceed the capabilities of present vendor design tools. Such tools must allow the implementation of scalable reconfigurable systems with various different partial modules that might be loaded to different positions of the device at runtime. 
This comprises several complex tasks, including floorplanning, communication architecture synthesis, physical constraints generation, physical implementation, and timing verification all the way down to the final bitstream generation. In this article, we present how our GoAhead framework helps in implementing self-aware systems on FPGAs with a minimum of user interaction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Niu:2014:SAT, author = "Xinyu Niu and Qiwei Jin and Wayne Luk and Stephen Weston", title = "A Self-Aware Tuning and Self-Aware Evaluation Method for Finite-Difference Applications in Reconfigurable Systems", journal = j-TRETS, volume = "7", number = "2", pages = "15:1--15:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617598", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jun 30 18:26:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Finite-difference methods are computationally intensive and required by many applications. Parameters of a finite-difference algorithm, such as grid size, can be varied to generate design space which contains algorithm instances with different constant coefficients. An algorithm instance with specific coefficients can either be mapped into general operators to construct static designs, or be implemented as constant-specific operators to form dynamic designs, which require runtime reconfiguration to update algorithm coefficients. 
This article proposes a tuning method to explore the design space to optimise both the static and the dynamic designs, and an evaluation method to select the design with maximum overall throughput, based on algorithm characteristics, design properties, available resources and runtime data size. For benchmark applications option pricing and Reverse-Time Migration (RTM), over 50\% reduction in resource consumption has been achieved for both static designs and dynamic designs, while meeting precision requirements. For a single hardware implementation, the RTM design optimised with the proposed approach is expected to run 1.8 times faster than the best published design. The tuned static designs run thousands of times faster than the dynamic designs for algorithms with small data size, while the tuned dynamic designs achieve up to 5.9 times speedup over the corresponding static designs for large-scale finite-difference algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Laforest:2014:CMP, author = "Charles Eric LaForest and Zimo Li and Tristan O'Rourke and Ming G. Liu and J. Gregory Steffan", title = "Composing Multi-Ported Memories on {FPGAs}", journal = j-TRETS, volume = "7", number = "3", pages = "16:1--16:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629629", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Multi-ported memories are challenging to implement on FPGAs since the block RAMs included in the fabric typically have only two ports. Hence we must construct memories requiring more than two ports, either out of logic elements or by combining multiple block RAMs.
We present a thorough exploration and evaluation of the design space of FPGA-based soft multi-ported memories for conventional solutions, and also for the recently proposed Live Value Table (LVT) [LaForest and Steffan 2010] and XOR [LaForest et al. 2012] approaches to unidirectional port memories, reporting results for both Altera and Xilinx FPGAs. Additionally, we thoroughly evaluate and compare with a recent LVT-based approach to bidirectional port memories [Choi et al. 2012].", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Peng:2014:BAH, author = "Yuanxi Peng and Manuel Salda{\~n}a and Christopher A. Madill and Xiaofeng Zou and Paul Chow", title = "Benefits of Adding Hardware Support for Broadcast and Reduce Operations in {MPSoC} Applications", journal = j-TRETS, volume = "7", number = "3", pages = "17:1--17:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629470", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "MPI has been used as a parallel programming model for supercomputers and clusters and recently in MultiProcessor Systems-on-Chip (MPSoC). One component of MPI is collective communication and its performance is key for certain parallel applications to achieve good speedups. Previous work showed that, with synthetic communication-only benchmarks, communication improvements of up to 11.4-fold and 22-fold for broadcast and reduce operations, respectively, can be achieved by providing hardware support at the network level in a Network-on-Chip (NoC). 
However, these numbers do not provide a good estimation of the advantage for actual applications, as there are other factors that affect performance besides communications, such as computation. To this end, we extend our previous work by evaluating the impact of hardware support over a set of five parallel application kernels of varying computation-to-communication ratios. By introducing some useful computation to the performance evaluation, we obtain more representative results of the benefits of adding hardware support for broadcast and reduce operations. The experiments show that applications with lower computation-to-communication ratios benefit the most from hardware support as they highly depend on efficient collective communications to achieve better scalability. We also extend our work by doing more analysis on clock frequency, resource usage, power, and energy. The results show reasonable scalability for resource utilization and power in the network interfaces as the number of channels increases and that, even though more power is dissipated in the network interfaces due to the added hardware, the total energy used can still be less if the actual speedup is sufficient. The application kernels are executed in a 24-embedded-processor system distributed across four FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Anderson:2014:ISI, author = "Jason Anderson and Kiyoung Choi", title = "Introduction to the {Special Issue on the 11th International Conference on Field-Programmable Technology (FPT'12)}", journal = j-TRETS, volume = "7", number = "3", pages = "18:1--18:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2655712", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cheah:2014:IDB, author = "Hui Yan Cheah and Fredrik Brosser and Suhaib A. Fahmy and Douglas L. Maskell", title = "The {iDEA DSP} Block-Based Soft Processor for {FPGAs}", journal = j-TRETS, volume = "7", number = "3", pages = "19:1--19:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629443", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "DSP blocks in modern FPGAs can be used for a wide range of arithmetic functions, offering increased performance while saving logic resources for other uses. They have evolved to better support a plethora of signal processing tasks, meaning that in other application domains they may be underutilised. The DSP48E1 primitives in new Xilinx devices support dynamic programmability that can help extend their usefulness; the specific function of a DSP block can be modified on a cycle-by-cycle basis. 
However, the standard synthesis flow does not leverage this flexibility in the vast majority of cases. The lean DSP Extension Architecture (iDEA) presented in this article builds around the dynamic programmability of a single DSP48E1 primitive, with minimal additional logic to create a general-purpose processor supporting a full instruction-set architecture. The result is a very compact, fast processor that can execute a full gamut of general machine instructions. We show a number of simple applications compiled using an MIPS compiler and translated to the iDEA instruction set, comparing with a Xilinx MicroBlaze to show estimated performance figures. Being based on the DSP48E1, this processor can be deployed across next-generation Xilinx Artix-7, Kintex-7, Virtex-7, and Zynq families.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Abdelfattah:2014:NCF, author = "Mohamed S. Abdelfattah and Vaughn Betz", title = "Networks-on-Chip for {FPGAs}: Hard, Soft or Mixed?", journal = j-TRETS, volume = "7", number = "3", pages = "20:1--20:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629442", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "As FPGA capacity increases, a growing challenge is connecting ever-more components with the current low-level FPGA interconnect while keeping designers productive and on-chip communication efficient. We propose augmenting FPGAs with networks-on-chip (NoCs) to simplify design, and we show that this can be done while maintaining or even improving silicon efficiency. 
We compare the area and speed efficiency of each NoC component when implemented hard versus soft to explore the space and inform our design choices. We then build on this component-level analysis to architect hard NoCs and integrate them into the FPGA fabric; these NoCs are on average 20--23$ \times $ smaller and 5--6$ \times $ faster than soft NoCs. A 64-node hard NoC uses only $ \sim $2\% of an FPGA's silicon area and metallization. We introduce a new communication efficiency metric: silicon area required per realized communication bandwidth. Soft NoCs consume 4960 mm$^2$/TBps, but hard NoCs are 84$ \times $ more efficient at 59 mm$^2$/TBps. Informed design can further reduce the area overhead of NoCs to 23 mm$^2$/TBps, which is only 2.6$ \times $ less efficient than the simplest point-to-point soft links (9 mm$^2$/TBps). Despite this almost comparable efficiency, NoCs can switch data across the entire FPGA while point-to-point links are very limited in capability; therefore, hard NoCs are expected to improve FPGA efficiency for more complex styles of communication.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2014:GMA, author = "Liang Chen and Tulika Mitra", title = "Graph Minor Approach for Application Mapping on {CGRAs}", journal = j-TRETS, volume = "7", number = "3", pages = "21:1--21:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2655242", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Coarse-Grained Reconfigurable Arrays (CGRAs) exhibit high performance, improved flexibility, low cost, and power efficiency for various application domains.
Compute-intensive loop kernels, which are perfect candidates to be executed on CGRAs, are usually mapped through modified modulo scheduling algorithms. These algorithms should be capable of performing both placement and routing. We formalize the CGRA mapping problem as a graph minor containment problem. We essentially test whether the dataflow graph representing the loop kernel is a minor of the modulo routing resource graph representing the CGRA resources and their interconnects. We design an exact graph minor testing approach that exploits the unique properties of both the dataflow graph and the routing resource graph to significantly prune the search space. We introduce additional heuristic strategies that drastically improve the compilation time while still generating optimal or near-optimal mapping solutions. Experimental evaluation confirms the efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kim:2014:USU, author = "Changmoo Kim and Mookyoung Chung and Yeongon Cho and Mario Konijnenburg and Soojung Ryu and Jeongwook Kim", title = "{ULP-SRP}: Ultra Low-Power {Samsung} Reconfigurable Processor for Biomedical Applications", journal = j-TRETS, volume = "7", number = "3", pages = "22:1--22:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629610", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The latest biomedical applications require low energy consumption, high performance, and wide energy-performance scalability to adapt to various working environments. 
In this study, we present ULP-SRP, an energy-efficient reconfigurable processor for biomedical applications. ULP-SRP uses a Coarse-Grained Reconfigurable Array (CGRA) for high-performance data processing with low energy consumption. We adopted a compact-size CGRA and modified it to support dynamically switchable three performance modes with fine-grained power gating in order to further optimize the energy consumption. The energy-performance scalability is also accomplished with multiple performance modes and a Unified Memory Architecture (UMA). Experimental results show that ULP-SRP achieved 59\% energy reduction compared to previous works. A technique of dynamic CGRA mode changing gives 18.9\% energy reduction. ULP-SRP is a good candidate for future mobile healthcare devices.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Voros:2014:ISI, author = "Nikolaos Voros and Guy Gogniat", title = "Introduction to the Special Issue on the {7th International Workshop on Reconfigurable Communication-centric Systems-on-Chip (ReCoSoC'12)}", journal = j-TRETS, volume = "7", number = "3", pages = "23:1--23:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2655710", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Brugger:2014:RRF, author = "Christian Brugger and Dominic Hillenbrand and Matthias Balzer", title = "{RIVER}: Reconfigurable Flow and Fabric for Real-Time Signal Processing on {FPGAs}", journal = j-TRETS, volume = "7", number = "3", pages = "24:1--24:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2655238", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "For high-performance embedded hard-real-time systems, ASICs and FPGAs hold advantages over general-purpose processors and graphics accelerators (GPUs). However, developing signal processing architectures from scratch requires significant resources. Our design methodology is based on sets of configurable building blocks that provide storage, dataflow, computation, and control. Based on our building blocks, we generate hundreds of thousands of our dynamic streaming engine processors that we call DSEs. We store our DSEs in a repository that can be queried for (online) design space exploration. From this repository, DSEs can be downloaded and instantiated within milliseconds on FPGAs. If a loss of flexibility can be tolerated then ASIC implementations are feasible as well. In this article we focus on FPGA implementations. Our DSEs vary in cores, computational lanes, bitwidths, power consumption, and frequency. To the best of our knowledge we are the first to propose online design space exploration based on repositories of precompiled cores that are assembled of common building blocks. 
For demonstration purposes we map algorithms for image processing and financial mathematics to DSEs and compare the performance to existing highly optimized signal and graphics accelerators.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Itturiet:2014:APE, author = "F{\'a}bio Itturiet and Gabriel Nazar and Ronaldo Ferreira and {\'A}lvaro Moreira and Luigi Carro", title = "Adaptive Parallelism Exploitation under Physical and Real-Time Constraints for Resilient Systems", journal = j-TRETS, volume = "7", number = "3", pages = "25:1--25:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2556943", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article introduces the resilient adaptive algebraic architecture that aims at adapting parallelism exploitation of a matrix multiplication algorithm in a time-deterministic fashion to reduce power consumption while meeting real-time deadlines present in most DSP-like applications. The proposed architecture provides low-overhead error correction capabilities relying on the hardware implementation of the algorithm-based fault-tolerance method that is executed concurrently with matrix multiplication, providing efficient occupation of memory and power resources. The Resilient Adaptive Algebraic Architecture (RA$^3$) is evaluated using three real-time industrial case studies from the telecom and multimedia application domains to present the design space exploration and the adaptation possibilities the architecture offers to hardware designers.
RA$^3$ is compared in its performance and energy efficiency with standard high-performance architectures, namely a GPU and an out-of-order general-purpose processor. Finally, we present the results of fault injection campaigns in order to measure the architecture resilience to soft errors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lam:2014:EFA, author = "Siew-Kei Lam and Christopher T. Clarke and Thambipillai Srikanthan", title = "Exploiting {FPGA}-Aware Merging of Custom Instructions for Runtime Reconfiguration", journal = j-TRETS, volume = "7", number = "3", pages = "26:1--26:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2655240", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Runtime reconfiguration is a promising solution for reducing hardware cost in embedded systems, without compromising on performance. We present a framework that aims to increase the performance benefits of reconfigurable processors that support full or partial runtime reconfiguration. The proposed framework achieves this by: (1) providing a means for choosing suitable custom instruction selection heuristics, (2) leveraging FPGA-aware merging of custom instructions to maximize the reconfigurable logic block utilization in each configuration, and (3) incorporating a hierarchical loop partitioning strategy to reduce runtime reconfiguration overhead. 
We show that the performance gain can be improved by employing suitable custom instruction selection heuristics that, in turn, depend on the reconfigurable resource constraints and the merging factor (extent to which the selected custom instructions can be merged). The hierarchical loop partitioning strategy leads to an average performance gain of over 31\% and 46\% for full and partial runtime reconfiguration, respectively. Performance gain can be further increased to over 52\% and 70\% for full and partial runtime reconfiguration, respectively, by exploiting FPGA-aware merging of custom instructions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Guillet:2014:EUM, author = "S{\'e}bastien Guillet and Florent de Lamotte and Nicolas le Griguer and {\'E}ric Rutten and Guy Gogniat and Jean-Philippe Diguet", title = "Extending {UML\slash MARTE} to Support Discrete Controller Synthesis, Application to Reconfigurable Systems-on-Chip Modeling", journal = j-TRETS, volume = "7", number = "3", pages = "27:1--27:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629628", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Sep 1 10:42:23 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents the first framework to design and synthesize a formal controller managing dynamic reconfiguration, using a model-driven engineering methodology based on an extension of UML/MARTE. 
The implementation technique highlights the combination of hard configuration constraints using weights (control part)---ensured statically and fulfilled by the system at runtime---and soft constraints (decision part) that, given a set of correct and accessible configurations, choose one of them. An application model of an image processing application is presented, then transformed and synthesized to be executed on a Xilinx platform to show how the controller, executed on a Microblaze, manages the hardware reconfigurations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "27", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Butler:2015:HSH, author = "Jon T. Butler and Tsutomu Sasao", title = "High-Speed Hardware Partition Generation", journal = j-TRETS, volume = "7", number = "4", pages = "1:1--1:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629472", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We demonstrate circuits that generate set and integer partitions on a set S of n objects at a rate of one per clock. Partitions are ways to group elements of a set together and have been extensively studied by researchers in algorithm design and theory. We offer two versions of a hardware set partition generator. In the first, partitions are produced in lexicographical order in response to successive clock pulses. In the second, an index input determines the set partition produced. Such circuits are useful in the hardware implementation of the optimum distribution of tasks to processors. We show circuits for integer partitions as well. Our circuits are combinational. For large n, they can have a large delay.
However, one can easily pipeline them to produce one partition per clock period. We show (1) analytical and (2) experimental time/complexity results that quantify the efficiency of our designs. For example, our results show that a hardware set partition generator running on a 100MHz FPGA produces partitions at a rate that is approximately 10 times the rate of a software implementation on a processor running at 2.26GHz.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Paulino:2015:RAB, author = "Nuno Paulino and Jo{\~a}o Canas Ferreira and Jo{\~a}o M. P. Cardoso", title = "A Reconfigurable Architecture for Binary Acceleration of Loops with Memory Accesses", journal = j-TRETS, volume = "7", number = "4", pages = "2:1--2:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629468", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents a reconfigurable hardware/software architecture for binary acceleration of embedded applications. A Reconfigurable Processing Unit (RPU) is used as a coprocessor of the General Purpose Processor (GPP) to accelerate the execution of repetitive instruction sequences called Megablocks. A toolchain detects Megablocks from instruction traces and generates customized RPU implementations. The implementation of Megablocks with memory accesses uses a memory-sharing mechanism to support concurrent accesses to the entire address space of the GPP's data memory. The scheduling of load/store operations and memory access handling have been optimized to minimize the latency introduced by memory accesses. 
The system is able to dynamically switch the execution between the GPP and the RPU when executing the original binaries of the input application. Our proof-of-concept prototype achieved geometric mean speedups of 1.60$ \times $ and 1.18$ \times $ for, respectively, a set of 37 benchmarks and a subset considering the 9 most complex benchmarks. With respect to a previous version of our approach, we achieved geometric mean speedup improvements from 1.22 to 1.53 for the 10 benchmarks previously used.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dhawan:2015:AEN, author = "Udit Dhawan and Andr{\'e} Dehon", title = "Area-Efficient Near-Associative Memories on {FPGAs}", journal = j-TRETS, volume = "7", number = "4", pages = "3:1--3:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629471", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Associative memories can map sparsely used keys to values with low latency but can incur heavy area overheads. The lack of customized hardware for associative memories in today's mainstream FPGAs exacerbates the overhead cost of building these memories using the fixed address match BRAMs. In this article, we develop a new, FPGA-friendly, memory system architecture based on a multiple hash scheme that is able to achieve near-associative performance without the area-delay overheads of a fully associative memory on FPGAs. At the same time, we develop a novel memory management algorithm that allows us to statistically mimic an associative memory. 
Using the proposed architecture as a 64KB L1 data cache, we show that it is able to achieve near-associative miss rates while consuming 3--13 $ \times $ fewer FPGA memory resources for a set of benchmark programs from the SPEC CPU2006 suite than fully associative memories generated by the Xilinx Coregen tool. Benefits for our architecture increase with key width, allowing area reduction up to 100 $ \times $. Mapping delay is also reduced to 3.7ns for a 1,024-entry flat version or 6.1ns for an area-efficient version compared to 17.6ns for a fully associative memory for a 64-bit key on a Xilinx Virtex 6 device.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Llamocca:2015:DEP, author = "Daniel Llamocca and Marios Pattichis", title = "Dynamic Energy, Performance, and Accuracy Optimization and Management Using Automatically Generated Constraints for Separable {$2$D} {FIR} Filtering for Digital Video Processing", journal = j-TRETS, volume = "7", number = "4", pages = "4:1--4:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629623", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "There is strong interest in the development of dynamically reconfigurable systems that can meet real-time constraints on energy, performance, and accuracy. The generation of real-time constraints will significantly expand the applicability of dynamically reconfigurable systems to new domains, such as digital video processing. We develop a dynamically reconfigurable 2D FIR filtering system that can meet real-time constraints in energy, performance, and accuracy (EPA). 
The real-time constraints are automatically generated based on user input, image types associated with video communications, and video content. We first generate a set of Pareto-optimal realizations, described by their EPA values and associated 2D FIR hardware description bitstreams. Dynamic management is then achieved by selecting Pareto-optimal realizations that meet the automatically generated time-varying EPA constraints. We validate our approach using three different 2D Gaussian filters. Filter realizations are evaluated in terms of the required energy per frame, accuracy of the resulting image, and performance in frames per second. We demonstrate dynamic EPA management by applying a Difference of Gaussians (DOG) filter to standard video sequences. For video frame sizes that are equal to or larger than the VGA resolution, compared to a static implementation, our dynamic system provides significant reduction in the total energy consumption ({$>$30}\%).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gojman:2015:GLG, author = "Benjamin Gojman and Sirisha Nalmela and Nikil Mehta and Nicholas Howarth and Andr{\'e} Dehon", title = "{GROK-LAB}: Generating Real On-chip Knowledge for Intra-cluster Delays Using Timing Extraction", journal = j-TRETS, volume = "7", number = "4", pages = "5:1--5:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2597889", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Timing Extraction identifies the delay of fine-grained components within an FPGA. From these computed delays, the delay of any path can be calculated. 
Moreover, a comparison of the fine-grained delays allows a detailed understanding of the amount and type of process variation that exists in the FPGA. To obtain these delays, Timing Extraction measures, using only resources already available in the FPGA, the delay of a small subset of the total paths in the FPGA. We apply Timing Extraction to the Logic Array Block (LAB) on an Altera Cyclone III FPGA to obtain a view of the delay down to near-individual LUT SRAM cell granularity, characterizing components with delays on the order of tens to a few hundred picoseconds with a resolution of $ \pm {}3.2 $ ps, matching the expected error bounds. This information reveals that the 65nm process used has, on average, random variation of $ \sigma / \mu = 4.0 \% $ with components having an average maximum spread of 83ps. Timing Extraction also shows that as $ V_{DD} $ decreases from 1.2V to 0.9V in a Cyclone IV 60nm FPGA, paths slow down, and variation increases from $ \sigma / \mu = 4.3 \% $ to $ \sigma / \mu = 5.8 \% $, a clear indication that lowering $ V_{DD} $ magnifies the impact of random variation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mahram:2015:NBH, author = "Atabak Mahram and Martin C. Herbordt", title = "{NCBI BLASTP} on High-Performance Reconfigurable Computing Systems", journal = j-TRETS, volume = "7", number = "4", pages = "6:1--6:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629691", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The BLAST sequence alignment program is a central application in bioinformatics.
The de facto standard version, NCBI BLAST, uses complex heuristics that make it challenging to simultaneously achieve both high performance and exact agreement. We propose a system that uses novel FPGA-based filters that reduce the input database by over 99.97\% without loss of sensitivity. There are several contributions. First is design of the filters themselves, which perform two-hit seeding, exhaustive ungapped alignment, and exhaustive gapped alignments, respectively. Second is the coupling of the filters, especially the two-hit seeding and the ungapped alignment. Third is pipelining the filters in a single design, including maintaining load balancing as data are reduced by orders of magnitude at each stage. Fourth is the optimization required to maintain operating frequency for the resulting complex design. And finally, there is system integration both in hardware (the Convey HC1-EX) and software (NCBI BLASTP). We present results for various usage scenarios and find complete agreement and a factor of nearly 5x speedup over a fully parallel implementation of the reference code on a contemporaneous CPU. We believe that the resulting system is the leading per-socket-accelerated NCBI BLAST.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Swierczynski:2015:PSE, author = "Pawel Swierczynski and Amir Moradi and David Oswald and Christof Paar", title = "Physical Security Evaluation of the Bitstream Encryption Mechanism of {Altera Stratix II} and {Stratix III} {FPGAs}", journal = j-TRETS, volume = "7", number = "4", pages = "7:1--7:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629462", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "To protect Field-Programmable Gate Array (FPGA) designs against Intellectual Property (IP) theft and related issues such as product cloning, all major FPGA manufacturers offer a mechanism to encrypt the bitstream that is used to configure the FPGA. From a mathematical point of view, the employed encryption algorithms (e.g., Advanced Encryption Standard (AES) or 3DES) are highly secure. However, it has been shown that the bitstream encryption feature of several FPGA families is susceptible to side-channel attacks based on measuring the power consumption of the cryptographic module. In this article, we present the first successful attack on the bitstream encryption of the Altera Stratix II and Stratix III FPGA families. To this end, we analyzed the Quartus II software and reverse engineered the details of the proprietary and unpublished schemes used for bitstream encryption on Stratix II and Stratix III. Using this knowledge, we demonstrate that the full 128-bit AES key of a Stratix II as well as the full 256-bit AES key of a Stratix III can be recovered by means of side-channel attacks. In both cases, the attack can be conducted in a few hours. 
The complete bitstream of these FPGAs that are (seemingly) protected by the bitstream encryption feature can hence fall into the hands of a competitor or criminal --- possibly implying system-wide damage if confidential information such as proprietary encryption schemes or secret keys programmed into the FPGA are extracted. In addition to lost IP, reprogramming the attacked FPGA with modified code, for instance, to secretly plant a hardware Trojan, is a particularly dangerous scenario for many security-critical applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Vliegen:2015:SRD, author = "Jo Vliegen and Nele Mentens and Ingrid Verbauwhede", title = "Secure, Remote, Dynamic Reconfiguration of {FPGAs}", journal = j-TRETS, volume = "7", number = "4", pages = "8:1--8:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629423", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "With the widespread availability of broadband Internet, Field-Programmable Gate Arrays (FPGAs) can get remote updates in the field. This provides hardware and software updates, and enables issue solving and upgrade ability without device modification. In order to prevent an attacker from eavesdropping or manipulating the configuration data, security is a necessity. This work describes an architecture that allows the secure, remote reconfiguration of an FPGA. The architecture is partially dynamically reconfigurable and it consists of a static partition that handles the secure communication protocol and a single reconfigurable partition that holds the main application.
Our solution distinguishes itself from existing work in two ways: it provides entity authentication and it avoids the use of a trusted third party. The former provides protection against active attackers on the communication channel, while the latter reduces the number of reliable entities. Additionally, this work provides basic countermeasures against simple power-oriented side-channel analysis attacks. The result is an implementation that is optimized toward minimal resource occupation. Because configuration updates occur infrequently, configuration speed is of minor importance with respect to area. A prototype of the proposed design is implemented, using 5,702 slices and having minimal downtime.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chau:2015:MAP, author = "Thomas C. P. Chau and Xinyu Niu and Alison Eele and Jan Maciejowski and Peter Y. K. Cheung and Wayne Luk", title = "Mapping Adaptive Particle Filters to Heterogeneous Reconfigurable Systems", journal = j-TRETS, volume = "7", number = "4", pages = "9:1--9:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629469", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents an approach for mapping real-time applications based on particle filters (PFs) to heterogeneous reconfigurable systems, which typically consist of multiple FPGAs and CPUs. A method is proposed to adapt the number of particles dynamically and to utilise runtime reconfigurability of FPGAs for reduced power and energy consumption. A data compression scheme is employed to reduce communication overhead between FPGAs and CPUs. 
A mobile robot localisation and tracking application is developed to illustrate our approach. Experimental results show that the proposed adaptive PF can reduce up to 99\% of computation time. Using runtime reconfiguration, we achieve a 25\% to 34\% reduction in idle power. A 1U system with four FPGAs is up to 169 times faster than a single-core CPU and 41 times faster than a 1U CPU server with 12 cores. It is also estimated to be 3 times faster than a system with four GPUs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Miller:2015:GBA, author = "Bailey Miller and Frank Vahid and Tony Givargis and Philip Brisk", title = "Graph-Based Approaches to Placement of Processing Element Networks on {FPGAs} for Physical Model Simulation", journal = j-TRETS, volume = "7", number = "4", pages = "10:1--10:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629521", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Feb 13 07:24:19 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Physical models utilize mathematical equations to characterize physical systems like airway mechanics, neuron networks, or chemical reactions. Previous work has shown that field programmable gate arrays (FPGAs) execute physical models efficiently. To improve the implementation of physical models on FPGAs, this article leverages graph theoretic techniques to synthesize physical models onto FPGAs. The first phase maps physical model equations onto a structured virtual processing element (PE) graph using graph theoretic folding techniques. The second phase maps the structured virtual PE graph onto physical PE regions on an FPGA using graph embedding theory. 
A simulated annealing algorithm is introduced that can map any physical model onto an FPGA regardless of the model's underlying topology. We further extend the simulated annealing approach by leveraging existing graph drawing algorithms to generate the initial placement. Compared to previous work on physical model implementation on FPGAs, embedding increases clock frequency by 25\% on average (for applicable topologies), whereas simulated annealing increases frequency by 13\% on average. The embedding approach typically produces a circuit whose frequency is limited by the FPGA clock instead of routing. Additionally, complex models that could not previously be routed due to complexity were made routable when using placement constraints.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{DiCarlo:2015:SSA, author = "Stefano {Di Carlo} and Giulio Gambardella and Paolo Prinetto and Daniele Rolfo and Pascal Trotta", title = "{SATTA}: a {Self-Adaptive Temperature-Based TDF Awareness} Methodology for Dynamically Reconfigurable {FPGAs}", journal = j-TRETS, volume = "8", number = "1", pages = "1:1--1:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2659001", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 7 16:45:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Dependability issues due to nonfunctional properties are emerging as a major cause of faults in modern digital systems. Effective countermeasures have to be developed to properly manage their critical timing effects. This article presents a methodology to avoid transition delay faults in field-programmable gate array (FPGA)-based systems, with low area overhead. 
The approach is able to exploit temperature information and aging characteristics to minimize the cost in terms of performances degradation and power consumption. The architecture of a hardware manager able to avoid delay faults is presented and analyzed extensively, as well as its integration in the standard implementation design flow.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cooke:2015:TAF, author = "Patrick Cooke and Jeremy Fowers and Greg Brown and Greg Stitt", title = "A Tradeoff Analysis of {FPGAs}, {GPUs}, and Multicores for Sliding-Window Applications", journal = j-TRETS, volume = "8", number = "1", pages = "2:1--2:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2659000", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 7 16:45:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The increasing usage of hardware accelerators such as Field-Programmable Gate Arrays (FPGAs) and Graphics Processing Units (GPUs) has significantly increased application design complexity. Such complexity results from a larger design space created by numerous combinations of accelerators, algorithms, and hw/sw partitions. Exploration of this increased design space is critical due to widely varying performance and energy consumption for each accelerator when used for different application domains and different use cases. To address this problem, numerous studies have evaluated specific applications across different architectures. In this article, we analyze an important domain of applications, referred to as sliding-window applications, implemented on FPGAs, GPUs, and multicore CPUs. 
For each device, we present optimization strategies and analyze use cases where each device is most effective. The results show that, for large input sizes, FPGAs can achieve speedups of up to $ 5.6 \times $ and $ 58 \times $ compared to GPUs and multicore CPUs, respectively, while also using up to an order of magnitude less energy. For small input sizes and applications with frequency-domain algorithms, GPUs generally provide the best performance and energy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Quinn:2015:CFE, author = "Heather Quinn and Diane Roussel-Dupre and Mike Caffrey and Paul Graham and Michael Wirthlin and Keith Morgan and Anthony Salazar and Tony Nelson and Will Howes and Eric Johnson and Jon Johnson and Brian Pratt and Nathan Rollins and Jim Krone", title = "The {Cibola Flight Experiment}", journal = j-TRETS, volume = "8", number = "1", pages = "3:1--3:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629556", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 7 16:45:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Over the past 15 years many organizations have researched the use of Static-Random Access Memory (SRAM)-based Field-Programmable Gate Arrays (FPGAs) in space. Although the components can provide a performance improvement over radiation-hardened processing components, random soft errors can occur from the naturally occurring space radiation environment. Many organizations have been developing methods for characterizing, emulating, and simulating radiation-induced events; mitigating and removing radiation-induced computational errors; and designing fault-tolerant reconfigurable spacecraft. 
Los Alamos National Laboratory has fielded one of the longest space-based FPGAs experiments, called the Cibola Flight Experiment (CFE), using Xilinx Virtex FPGAs. CFE has successfully deployed commercial SRAM FPGAs into a low-Earth orbit with Single-Event Upset (SEU) mitigation and was able to exploit effectively the reconfigurability and customization of FPGAs in a harsh radiation environment. Although older than current state-of-the-art FPGAs, these same concepts are used to deploy newer FPGA-based space systems since the launch of the CFE satellite and will continue to be useful for newer systems. In this article, we present how the system was designed to be fault tolerant, prelaunch predictions of expected on-orbit behaviors, and on-orbit results.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Davidson:2015:IDC, author = "Tom Davidson and Elias Vansteenkiste and Karel Heyse and Karel Bruneel and Dirk Stroobandt", title = "Identification of Dynamic Circuit Specialization Opportunities in {RTL} Code", journal = j-TRETS, volume = "8", number = "1", pages = "4:1--4:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629640", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 7 16:45:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Dynamic Circuit Specialization (DCS) optimizes a Field-Programmable Gate Array (FPGA) design by assuming a set of its input signals are constant for a reasonable amount of time, leading to a smaller and faster FPGA circuit. When the signals actually change, a new circuit is loaded into the FPGA through runtime reconfiguration. The signals the design is specialized for are called parameters. 
For certain designs, parameters can be selected so the DCS implementation is both smaller and faster than the original implementation. However, DCS also introduces an overhead that is difficult for the designer to take into account, making it hard to determine whether a design is improved by DCS or not. This article presents extensive results on a profiling methodology that analyses Register-Transfer Level (RTL) implementations of applications to check if DCS would be beneficial. It proposes to use the functional density as a measure for the area efficiency of an implementation, as this measure contains both the overhead and the gains of a DCS implementation. The first step of the methodology is to analyse the dynamic behaviour of signals in the design, to find good parameter candidates. The overhead of DCS is highly dependent on this dynamic behaviour. A second stage calculates the functional density for each candidate and compares it to the functional density of the original design. The profiling methodology resulted in three implementations of a profiling tool, the DCS-RTL profiler. The execution time, accuracy, and the quality of each implementation is assessed based on data from 10 RTL designs. All designs, except for the two 16-bit adaptable Finite Impulse Response (FIR) filters, are analysed in 1 hour or less.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Iturbe:2015:MAH, author = "Xabier Iturbe and Khaled Benkrid and Chuan Hong and Ali Ebrahim and Raul Torrego and Tughrul Arslan", title = "Microkernel Architecture and Hardware Abstraction Layer of a Reliable Reconfigurable Real-Time Operating System {(R3TOS)}", journal = j-TRETS, volume = "8", number = "1", pages = "5:1--5:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629639", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 7 16:45:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents a new solution for easing the development of reconfigurable applications using Field-Programmable Gate Arrays (FPGAs). Namely, our Reliable Reconfigurable Real-Time Operating System (R3TOS) provides OS-like support for partially reconfigurable FPGAs. Unlike related works, R3TOS is founded on the basis of resource reusability and computation ephemerality. It makes intensive use of reconfiguration at very fine FPGA granularity, keeping the logic resources used only while performing computation and releasing them as soon as it is completed. To achieve this goal, R3TOS goes beyond the traditional approach of using reconfigurable slots with fixed boundaries interconnected by means of a static communication infrastructure. Instead, R3TOS approaches a static route-free system where nearly everything is reconfigurable. The tasks are concatenated to form a computation chain through which partial results naturally flow, and data are exchanged among remotely located tasks using FPGA's reconfiguration mechanism or by means of ``removable'' routing circuits.
In this article, we describe the R3TOS microkernel architecture as well as its hardware abstraction services and programming interface. Notably, the article presents a set of novel circuits and mechanisms to overcome the limitations and exploit the opportunities of Xilinx reconfigurable technology in the scope of hardware multitasking and dependability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shi:2015:IDD, author = "Kan Shi and David Boland and George A. Constantinides", title = "Imprecise Datapath Design: an Overclocking Approach", journal = j-TRETS, volume = "8", number = "2", pages = "6:1--6:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629527", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article, we describe an alternative circuit design methodology when considering trade-offs between accuracy, performance, and silicon area. We compare two different approaches that could trade accuracy for performance. One is the traditional approach where the precision used in the datapath is limited to meet a target latency. The other is a proposed new approach which simply allows the datapath to operate without timing closure. We demonstrate analytically and experimentally that on average our approach obtains either smaller errors or equivalent faster operating frequencies in comparison to the traditional approach. This is because the worst case caused by timing violations only happens rarely, while precision loss results in errors to most data. 
We also show that for basic arithmetic operations such as addition, applying our approach to the simple building block of ripple carry adders can achieve better accuracy or performance than using faster adder designs to achieve similar latency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Woods:2015:PDP, author = "Louis Woods and Gustavo Alonso and Jens Teubner", title = "Parallelizing Data Processing on {FPGAs} with Shifter Lists", journal = j-TRETS, volume = "8", number = "2", pages = "7:1--7:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629551", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Parallelism is currently seen as a mechanism to minimize the impact of the power and heat dissipation problems encountered in modern hardware. Data parallelism --- based on partitioning the data --- and pipeline parallelism --- based on partitioning the computation --- are the two main approaches to leverage parallelism on a wide range of hardware platforms. Unfortunately, not all data processing problems are susceptible to either of those strategies. An example is the skyline operator [B{\"o}rzs{\"o}nyi et al. 2001], which computes the set of Pareto-optimal points within a multidimensional dataset. Existing approaches to parallelize the skyline operator are based on data parallelism. As a result, they suffer from a high overhead when merging intermediate results because of the lack of a global view of the problem inherent to partitioning the input data.
In this article, we show how to combine pipeline with data parallelism on a Field-Programmable Gate Array (FPGA) for a more efficient utilization of the available hardware parallelism. As we show in our experiments, skyline computation using our proposed technique scales linearly with the number of processing elements, and the performance we achieve on a rather small FPGA is comparable to that of a 64-core high-end server running a state-of-the-art data parallel implementation of skyline [Park et al. 2009]. The proposed approach to parallelize the skyline operator can be generalized to a wider range of data processing problems. We demonstrate this through a novel, highly parallel data structure, a shifter list, that can be efficiently implemented on an FPGA. The resulting template is easy to parametrize to implement a variety of computationally intensive operators such as frequent items, $n$-closest pairs, or K-means.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cardoso:2015:GEF, author = "Jo{\~a}o M. P. Cardoso and Pedro C. Diniz and Katherine (Compton) Morrow", title = "Guest Editorial: {FPL 2013}", journal = j-TRETS, volume = "8", number = "2", pages = "8:1--8:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2737805", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ferreira:2015:RFP, author = "Ricardo Ferreira and Luciana Rocha and Andr{\'e} G.
Santos and Jos{\'e} A. M. Nacif and Stephan Wong and Luigi Carro", title = "A Runtime {FPGA} Placement and Routing Using Low-Complexity Graph Traversal", journal = j-TRETS, volume = "8", number = "2", pages = "9:1--9:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2660775", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Dynamic Partial Reconfiguration (DPaR) enables efficient allocation of logic resources by adding new functionalities or by sharing and/or multiplexing resources over time. Placement and routing (P\&R) is one of the most time-consuming steps in the DPaR flow. P\&R are two independent NP-complete problems, and, even for medium size circuits, traditional P\&R algorithms are not capable of placing and routing hardware modules at runtime. We propose a novel runtime P\&R algorithm for Field-Programmable Gate Array (FPGA)-based designs. Our algorithm models the FPGA as an implicit graph with a direct correspondence to the target FPGA. The P\&R is performed as a graph mapping problem by exploring the node locality during a depth-first traversal. We perform the P\&R using a greedy heuristic that executes in polynomial time. Unlike state-of-the-art algorithms, our approach does not try similar solutions, thus allowing the P\&R to execute in milliseconds. Our algorithm is also suitable for P\&R in fragmented regions. We generate results for a manufacturer-independent virtual FPGA. Compared with the most popular P\&R tool running the same benchmark suite, our algorithm is up to three orders of magnitude faster.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Murray:2015:TDT, author = "Kevin E. Murray and Scott Whitty and Suya Liu and Jason Luu and Vaughn Betz", title = "Timing-Driven {Titan}: Enabling Large Benchmarks and Exploring the Gap between Academic and Commercial {CAD}", journal = j-TRETS, volume = "8", number = "2", pages = "10:1--10:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629579", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Benchmarks play a key role in Field-Programmable Gate Array (FPGA) architecture and CAD research, enabling the quantitative comparison of tools and architectures. It is important that these benchmarks reflect modern large-scale systems that make use of heterogeneous resources; however, most current FPGA benchmarks are both small and simple. In this article, we present Titan, a hybrid CAD flow that addresses these issues. The flow uses Altera's Quartus II FPGA CAD software to perform HDL synthesis and a conversion tool to translate the result into the academic Berkeley Logic Interchange Format (BLIF). Using this flow, we created the Titan23 benchmark set, which consists of 23 large (90K--1.8M block) benchmark circuits covering a wide range of application domains. Using the Titan23 benchmarks and an enhanced model of Altera's Stratix IV architecture, including a detailed timing model, we compare the performance and quality of VPR and Quartus II targeting the same architecture. We found that VPR is at least $ 2.8 \times $ slower, uses $ 6.2 \times $ more memory, $ 2.2 \times $ more wire, and produces critical paths $ 1.5 \times $ slower compared to Quartus II.
Finally, we identified that VPR's focus on achieving a dense packing and an inability to take apart clusters is responsible for a large portion of the wire length and critical path delay gap.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gan:2015:SGA, author = "Lin Gan and Haohuan Fu and Wayne Luk and Chao Yang and Wei Xue and Xiaomeng Huang and Youhui Zhang and Guangwen Yang", title = "Solving the Global Atmospheric Equations through Heterogeneous Reconfigurable Platforms", journal = j-TRETS, volume = "8", number = "2", pages = "11:1--11:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629581", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "One of the most essential and challenging components in climate modeling is the atmospheric model. To solve multiphysical atmospheric equations, developers have to face extremely complex stencil kernels that are costly in terms of both computing and memory resources. This article aims to accelerate the solution of global shallow water equations (SWEs), which is one of the most essential equation sets describing atmospheric dynamics. We first design a hybrid methodology that employs both the host CPU cores and the field-programmable gate array (FPGA) accelerators to work in parallel. Through a careful adjustment of the computational domains, we achieve a balanced resource utilization and a further improvement of the overall performance. By decomposing the resource-demanding SWE kernel, we manage to map the double-precision algorithm into three FPGAs. 
Moreover, by using fixed-point and reduced-precision floating point arithmetic, we manage to build a fully pipelined mixed-precision design on a single FPGA, which can perform 428 floating-point and 235 fixed-point operations per cycle. The mixed-precision design with four FPGAs running together can achieve a speedup of 20 over a fully optimized design on a CPU rack with two eight-core processors and is 8 times faster than the fully optimized Kepler GPU design. As for power efficiency, the mixed-precision design with four FPGAs is 10 times more power efficient than a Tianhe-1A supercomputer node.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Das:2015:ASE, author = "Anup Das and Shyamsundar Venkataraman and Akash Kumar", title = "Autonomous Soft-Error Tolerance of {FPGA} Configuration Bits", journal = j-TRETS, volume = "8", number = "2", pages = "12:1--12:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629580", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Field-programmable gate arrays (FPGAs) are increasingly susceptible to radiation-induced single event upsets (SEUs). These upsets are predominant in a space environment; however, with increasing use of static RAM (SRAM) in modern FPGAs, these SEUs are gaining prominence even in a terrestrial environment. SEUs can flip SRAM bits of FPGA, potentially altering the functionality of the implemented design. This has motivated FPGA designers to investigate techniques to protect the FPGA configuration bits against such inadvertent bit flips (soft error). 
Traditionally, triple modular redundancy (TMR) is used to protect the FPGA bit flips. Increasing design complexity and limited battery life motivate for alternative approaches for soft-error tolerance. In this article, we propose a technique to improve autonomous fault-masking capabilities of a design by maximizing the number of zeros or ones in lookup tables (LUTs). The technique analyzes critical configuration bits and utilizes spare resources (XOR gates and carry chains) of FPGAs to selectively manipulate the logic implemented in LUTs using two operations: LUT restructuring and LUT decomposition. We implemented the proposed approach for Xilinx Virtex-6 FPGAs and validated the same with a wide set of designs from the MCNC, IWLS 2005, and ITC99 benchmark suites. Results demonstrate that the proposed logic restructuring maximizes logic 0 (or 1) of LUTs by an average of 20\%, achieving 80\% fault masking with no area overhead. The fault rate of the entire design is reduced by 60\% on average as compared to the existing techniques. Furthermore, the logic decomposition algorithm provides incremental fault-tolerance capabilities and achieves an additional 5\% fault masking with an average 7\% increase in slice usage. The complete methodology is implemented into a tool for Xilinx FPGA and is made available online for the benefit of the research community. The algorithms are lightweight, and the whole design flow (including Xilinx Place and Route) was completed in 75 minutes for the largest benchmark in the set.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Istvan:2015:HTL, author = "Zsolt Istv{\'a}n and Gustavo Alonso and Michaela Blott and Kees Vissers", title = "A Hash Table for Line-Rate Data Processing", journal = j-TRETS, volume = "8", number = "2", pages = "13:1--13:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629582", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:20 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "FPGA-based data processing is becoming increasingly relevant in data centers, as the transformation of existing applications into dataflow architectures can bring significant throughput and power benefits. Furthermore, a tighter integration of computing and network is appealing, as it overcomes traditional bottlenecks between CPUs and network interfaces, and dramatically reduces latency. In this article, we present the design of a novel hash table, a fundamental building block used in many applications, to enable data processing on FPGAs close to the network. We present a fully pipelined design capable of sustaining consistent 10Gbps line-rate processing by deploying a concurrent mechanism to handle hash collisions. We address additional design challenges such as support for a broad range of key sizes without stalling the pipeline through careful matching of lookup time with packet reception time. Finally, the design is based on a scalable architecture that can be easily parameterized to work with different memory types operating at different access speeds and latencies. We have tested the proposed hash table in an FPGA-based memcached appliance implementing a main-memory key-value store in hardware. 
The hash table is used to index 2 million entries in 24GB of external DDR3 DRAM while sustaining 13 million requests per second, the maximum packet rate that can be achieved with UDP packets on a 10Gbps link for this application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Huang:2015:ECO, author = "Qijing Huang and Ruolong Lian and Andrew Canis and Jongsok Choi and Ryan Xi and Nazanin Calagar and Stephen Brown and Jason Anderson", title = "The Effect of Compiler Optimizations on High-Level Synthesis-Generated Hardware", journal = j-TRETS, volume = "8", number = "3", pages = "14:1--14:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629547", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We consider the impact of compiler optimizations on the quality of high-level synthesis (HLS)-generated field-programmable gate array (FPGA) hardware. Using an HLS tool implemented within the state-of-the-art LLVM compiler, we study the effect of compiler optimizations on the hardware metrics of circuit area, execution cycles, FMax, and wall-clock time. We evaluate 56 different compiler optimizations implemented within LLVM and show that some optimizations significantly affect hardware quality. Moreover, we show that hardware quality is also affected by some optimization parameter values, as well as the order in which optimizations are applied. 
We then present a new HLS-directed approach to compiler optimizations, wherein we execute partial HLS and profiling at intermittent points in the optimization process and use the results to judiciously undo the impact of optimization passes predicted to be damaging to the generated hardware quality. Results show that our approach produces circuits with 16\% better speed performance, on average, versus using the standard {\tt -O3} optimization level.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Niu:2015:AEI, author = "Xinyu Niu and Thomas C. P. Chau and Qiwei Jin and Wayne Luk and Qiang Liu and Oliver Pell", title = "Automating Elimination of Idle Functions by Runtime Reconfiguration", journal = j-TRETS, volume = "8", number = "3", pages = "15:1--15:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700415", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A design approach is proposed to automatically identify and exploit runtime reconfiguration opportunities with optimised resource utilisation by eliminating idle functions. We introduce Reconfiguration Data Flow Graph, a hierarchical graph structure enabling reconfigurable designs to be synthesised in three steps: function analysis, configuration organisation, and runtime solution generation. The synthesised reconfigurable designs are dynamically evaluated and selected under various runtime conditions. Three applications---barrier option pricing, particle filter, and reverse time migration---are used in evaluating the proposed approach.
The runtime solutions approximate their theoretical performance by eliminating idle functions and are 1.31 to 2.19 times faster than optimised static designs. FPGA designs developed with the proposed approach are up to 43.8 times faster than optimised CPU reference designs and 1.55 times faster than optimised GPU designs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bhasin:2015:EFB, author = "Shivam Bhasin and Jean-Luc Danger and Sylvain Guilley and Wei He", title = "Exploiting {FPGA} Block Memories for Protected Cryptographic Implementations", journal = j-TRETS, volume = "8", number = "3", pages = "16:1--16:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629552", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Modern field programmable gate arrays (FPGAs) are power packed with features to facilitate designers. Availability of features like large block memory (BRAM), digital signal processing cores, and embedded CPU makes the design strategy of FPGAs quite different from ASICs. FPGAs are also widely used in security-critical applications where protection against known attacks is of prime importance. We focus on physical attacks that target physical implementations. To design countermeasures against such attacks, the strategy for FPGA designers should be different from that in ASIC. The available features should be exploited to design compact and strong countermeasures. In this article, we propose methods to exploit the BRAMs in FPGAs for designing compact countermeasures. 
Internal BRAM can be used to optimize intrinsic countermeasures such as masking and dual-rail logics, which otherwise have significant overhead (at least $ 2 \times $) compared to unprotected ones. The optimizations are applied on a real AES-128 co-processor and tested for area overhead and resistance on Xilinx Virtex-5 chips. The presented masking countermeasure has an overhead of only 16\% when applied on AES. Moreover, the dual-rail precharge logic (DPL) countermeasure has been optimized to pack the whole sequential part in the BRAM, hence enhancing the security. Proper robustness evaluations are conducted to analyze the optimization in terms of area and security.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Eusse:2015:CNP, author = "Juan Fernando Eusse and Christopher Williams and Rainer Leupers", title = "{CoEx}: a Novel Profiling-Based Algorithm\slash Architecture Co-Exploration for {ASIP} Design", journal = j-TRETS, volume = "8", number = "3", pages = "17:1--17:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2629563", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Application-Specific Instruction Set Processors (ASIPs) provide the adequate performance/efficiency tradeoff for their particular application domain. Nevertheless, their design methodologies have stagnated during the past decade and are still based on a series of manual and time-consuming iterative steps. Furthermore, there exists a productivity gap between the point where an application is given as the target for processor customization and the time a customized architecture is available. 
Therefore, new tools are required that reduce the number of design iterations and bridge the aforementioned productivity gap. This can be achieved by (1) profiling technologies that, by adapting to the designer's needs, help to gain insight into application specifications, and (2) prearchitectural design technologies that give early yet accurate feedback on the impact of algorithmic/architectural design decisions. The first requirement is addressed in this article by proposing the multigrained profiling approach, which identifies the profiling needs at each step of ASIP design and lets the designer tailor the level of detail for application inspection. CoEx, a practical implementation of the approach, is also introduced. The second requirement is addressed by creating a prearchitectural estimation engine. This engine couples CoEx reports for an application with an abstract processor model and generates an estimate of the achievable performance. Both CoEx and the performance estimation engine are respectively evaluated for instrumentation-induced execution overhead and accuracy. Finally, the development of an ASIP architecture for an augmented reality computer vision application is presented. The ASIP achieves a gain of six times compared to the original application performance, after being developed in only 2 days.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Das:2015:ETD, author = "Anup Das and Amit Kumar Singh and Akash Kumar", title = "Execution Trace-Driven Energy-Reliability Optimization for Multimedia {MPSoCs}", journal = j-TRETS, volume = "8", number = "3", pages = "18:1--18:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2665071", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Multiprocessor systems-on-chip (MPSoCs) are becoming a popular design choice in current and future technology nodes to accommodate the heterogeneous computing demand of a multitude of applications enabled on these platforms. Streaming multimedia and other communication-centric applications constitute a significant fraction of the application space of these devices. The mapping of an application on an MPSoC is an NP-hard problem. This has attracted researchers to solve this problem both as stand-alone (best-effort) and in conjunction with other optimization objectives, such as energy and reliability. Most existing studies on energy-reliability joint optimization are static---that is, design time based. These techniques fail to capture runtime variability such as resource unavailability and dynamism associated with application behaviors, which are typical of multimedia applications. The few studies that consider dynamic mapping of applications do not consider throughput degradation, which directly impacts user satisfaction. This article proposes a runtime technique to analyze the execution trace of an application modeled as Synchronous Data Flow Graphs (SDFGs) to determine its mapping on a multiprocessor system with heterogeneous processing units for different fault scenarios.
Further, communication energy is minimized for each of these mappings while satisfying the throughput constraint. Experiments conducted with synthetic and real SDFGs demonstrate that the proposed technique achieves significant improvement with respect to the state-of-the-art approaches in terms of throughput and storage overhead with less than 20\% energy overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ren:2015:EFT, author = "Yu Ren and Leibo Liu and Shouyi Yin and Jie Han and Shaojun Wei", title = "Efficient Fault-Tolerant Topology Reconfiguration Using a Maximum Flow Algorithm", journal = j-TRETS, volume = "8", number = "3", pages = "19:1--19:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700417", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "With an increasing number of processing elements (PEs) integrated on a single chip, fault-tolerant techniques are critical to ensure the reliability of such complex systems. In current reconfigurable architectures, redundant PEs are utilized for fault tolerance. In the presence of faulty PEs, the physical topologies of various chips may be different, so the concept of virtual topology from network embedding problem has been used to alleviate the burden for the operating systems. With limited hardware resources, how to reconfigure a system into the most effective virtual topology such that the maximum repair rate can be reached presents a significant challenge. In this article, a new approach using a maximum flow (MF) algorithm is proposed for an efficient topology reconfiguration in reconfigurable architectures. 
In this approach, topology reconfiguration is converted into a network flow problem by constructing a directed graph; the solution is then found by using the MF algorithm. This approach optimizes the use of spare PEs with minimal impacts on area, throughput, and delay, and thus it significantly improves the repair rate of faulty PEs. In addition, it achieves a polynomial reconfiguration time. Experimental results show that compared to previous methods, the MF approach increases the probability to repair faulty PEs by up to 50\% using the same redundant resources. Compared to a fault-free system, the throughput only decreases by less than 2.5\% and latency increases by less than 4\%. To consider various types of PEs in a practical application, a cost factor is introduced into the MF algorithm. An enhanced approach using a minimum-cost MF algorithm is further shown to be efficient in the fault-tolerant reconfiguration of heterogeneous reconfigurable architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dobai:2015:LLF, author = "Roland Dobai and Lukas Sekanina", title = "Low-Level Flexible Architecture with Hybrid Reconfiguration for Evolvable Hardware", journal = j-TRETS, volume = "8", number = "3", pages = "20:1--20:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700414", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue May 19 17:05:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Field-programmable gate arrays (FPGAs) can be considered to be the most popular and successful platform for evolvable hardware. They allow one to establish and later reconfigure candidate solutions. 
Recent work in the field of evolvable hardware includes the use of virtual and native reconfigurations. Virtual reconfiguration is based on the change of functionality by hardware components implemented on top of FPGA resources. Native reconfiguration changes the FPGA resources directly by means provided by the FPGA manufacturer. Both of these approaches have their disadvantages. The virtual reconfiguration is characterized by lower maximal operational frequency of the resulting solutions, and the native reconfiguration is slower. In this work, a hybrid approach is used merging the advantages while limiting the disadvantages of the virtual and native reconfigurations. The main contribution is the new low-level architecture for evolvable hardware in the new Zynq-7000 all-programmable system-on-chip. The proposed architecture offers high flexibility in comparison with other evolvable hardware systems by considering direct modification of the reconfigurable resources. The impact of the higher reconfiguration time of the native approach is limited by the dense placement of the proposed reconfigurable processing elements. These processing elements also ensure fast evaluation of candidate solutions. The proposed architecture is evaluated by evolutionary design of switching image filters and edge detectors. The experimental results demonstrate advantages over the previous approaches considering the time required for evolution, area overhead, and flexibility.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kirchgessner:2015:LOF, author = "Robert Kirchgessner and Alan D. 
George and Greg Stitt", title = "Low-Overhead {FPGA} Middleware for Application Portability and Productivity", journal = j-TRETS, volume = "8", number = "4", pages = "21:1--21:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2746404", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Oct 5 08:47:01 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Reconfigurable computing devices such as field-programmable gate arrays (FPGAs) offer advantages over fixed-logic CPU and GPU architectures, including improved performance, superior power efficiency, and reconfigurability. The challenge of FPGA application development, however, has limited their acceptance in high-performance computing and high-performance embedded computing applications. FPGA development carries similar difficulties to hardware design, requiring that developers iterate through register-transfer level designs with cycle-level accuracy. Furthermore, the lack of hardware and software standards between FPGA platforms limits productivity and application portability, and makes porting applications between heterogeneous platforms a time-consuming and often challenging process. Recent efforts to improve FPGA productivity using high-level synthesis tools and languages show promise, but platform support remains limited and typically is left as a challenge for developers. To address these issues, we present RC Middleware (RCMW), a novel middleware that improves productivity and enables application and tool portability by abstracting away platform-specific details. RCMW provides an application-centric development environment, exposing only the resources and standardized interfaces required by an application, independent of the underlying platform. We demonstrate the portability and productivity benefits of RCMW using four heterogeneous platforms from three vendors. 
Our results indicate that RCMW enables application productivity and improves developer productivity, and that these benefits are achieved with less than 7\% performance and 3\% area overhead on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", }

@Article{Jacobsen:2015:RRI,
  author =       "Matthew Jacobsen and Dustin Richmond and Matthew
                 Hogains and Ryan Kastner",
  title =        "{RIFFA 2.1}: a Reusable Integration Framework for
                 {FPGA} Accelerators",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "22:1--22:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2815631",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We present RIFFA 2.1, a reusable integration framework
                 for Field-Programmable Gate Array (FPGA) accelerators.
                 RIFFA provides communication and synchronization for
                 FPGA accelerated applications using simple interfaces
                 for hardware and software. Our goal is to expand the
                 use of FPGAs as an acceleration platform by releasing,
                 as open source, a framework that easily integrates
                 software running on commodity CPUs with FPGA cores.
                 RIFFA uses PCI Express (PCIe) links to connect FPGAs
                 to a CPU's system bus. RIFFA 2.1 supports FPGAs from
                 Xilinx and Altera, Linux and Windows operating
                 systems, and allows multiple FPGAs to connect to a
                 single host PC system. It has software bindings for
                 C/C++, Java, Python, and Matlab. Tests show that data
                 transfers between hardware and software can reach
                 97\% of the achievable PCIe link bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Thomas:2015:THG,
  author =       "David B. Thomas",
  title =        "The Table-{Hadamard} {GRNG}: an Area-Efficient {FPGA}
                 {Gaussian} Random Number Generator",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "23:1--23:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629607",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Gaussian random number generators (GRNGs) are an
                 important component in parallel Monte Carlo
                 simulations using FPGAs, where tens or hundreds of
                 high-quality Gaussian samples must be generated per
                 cycle using very few logic resources. This article
                 describes the Table-Hadamard generator, which is a
                 GRNG designed to generate multiple streams of random
                 numbers in parallel. It uses discrete table
                 distributions to generate pseudo-Gaussian base
                 samples, then a parallel Hadamard transform to
                 efficiently apply the central limit theorem. When
                 generating 64 output samples, the Table-Hadamard
                 requires just 130 slices per generated sample, which
                 is a third of the resources needed by the next best
                 technique, while still providing higher statistical
                 quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Jin:2015:MID,
  author =       "Zheming Jin and Jason D.
Bakos",
  title =        "Memory Interface Design for {$3$D} Stencil Kernels on
                 a Massively Parallel Memory System",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "24:1--24:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2800788",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Massively parallel memory systems are designed to
                 deliver high bandwidth at relatively low clock speed
                 for memory-intensive applications implemented on
                 programmable logic. For example, the Convey HC-1
                 provides 1,024 DRAM banks to each of four FPGAs
                 through a full crossbar, presenting a peak bandwidth
                 of 76.8GB/s to the user logic. Such highly parallel
                 memory systems suffer from high latency, and their
                 effective bandwidth is highly sensitive to access
                 ordering. To achieve high performance, the user must
                 use a customized memory interface that combines
                 scheduling, latency hiding, and data reuse. In this
                 article, we describe the design of a custom memory
                 interface for 3D stencil kernels on the Convey HC-1
                 that incorporates these features. Experimental
                 results show that the proposed memory interface
                 achieves a speedup in runtime of 2.2 for 6-point
                 stencil and 9.5 for 27-point stencil when compared to
                 a naive memory interface.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tan:2015:SHP,
  author =       "Guangming Tan and Chunming Zhang and Wendi Wang and
                 Peiheng Zhang",
  title =        "{SuperDragon}: a Heterogeneous Parallel System for
                 Accelerating {$3$D} Reconstruction of Cryo-Electron
                 Microscopy Images",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "25:1--25:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2740966",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The data deluge in medical imaging processing requires
                 faster and more efficient systems. Due to the advance
                 in recent heterogeneous architecture, there has been
                 a resurgence in research aimed at domain-specific
                 accelerators. In this article, we develop an
                 experimental system SuperDragon for evaluating
                 acceleration of a single-particle Cryo-electron
                 microscopy (Cryo-EM) 3D reconstruction package EMAN
                 through a hybrid of CPU, GPU, and FPGA parallel
                 architecture. Based on a comprehensive workload
                 characterization, we exploit multigrained parallelism
                 in the Cryo-EM 3D reconstruction algorithm and
                 investigate a proper computational mapping to the
                 underlying heterogeneous architecture. The package is
                 restructured with task-level (MPI), thread-level
                 (OpenMP), and data-level (GPU and FPGA) parallelism.
                 Especially, the proposed FPGA accelerator is a stream
                 architecture that emphasizes the importance of
                 optimizing computing dominated data access patterns.
                 Besides, the configurable computing streams are
                 constructed by arranging the hardware modules and
                 bypassing channels to form a linear deep pipeline.
Compared to the multicore (six-core) program, the GPU and FPGA
                 implementations achieve speedups of 8.4 and 2.25
                 times in execution time while improving power
                 efficiency by factors of 7.2 and 14.2, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Biedermann:2015:SDR,
  author =       "Alexander Biedermann and Sorin A. Huss and Adeel
                 Israr",
  title =        "Safe Dynamic Reshaping of Reconfigurable {MPSoC}
                 Embedded Systems for Self-Healing and Self-Adaption
                 Purposes",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700416",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multiprocessor system-on-chip (MPSoC) architectures
                 are a huge challenge in embedded system design. This
                 situation arises from the fact that available MPSoCs
                 and related designs flows are not tailored to the
                 specific needs of embedded systems. This work
                 demonstrates how to provide self-healing properties
                 in embedded MPSoC design. This is achieved by
                 combining the features of a generic approach to
                 create virtualizable MPSoCs out of off-the-shelf
                 embedded processors with a methodology to derive
                 system configurations, such as task-processor
                 bindings, which are optimal in terms of safety and
                 execution time. The virtualization properties enable
                 a reshaping of the MPSoC at runtime. Thus, system
                 configurations may be exchanged rapidly in a dynamic
                 fashion. As a main result of this work, embedded
                 multiprocessor systems are introduced, which
                 dynamically adapt to changing operating conditions,
                 possible module defects, and internal state changes.
                 We demonstrate the figures of merit of such
                 reconfigurable MPSoC embedded systems by means of a
                 complex automotive application scenario mapped to an
                 FPGA featuring a virtualizable array of eight
                 soft-core processors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Park:2015:PIC,
  author =       "Joonseok Park and Pedro C. Diniz",
  title =        "Program-Invariant Checking for Soft-Error Detection
                 using Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2751563",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "There is an increasing concern about transient errors
                 in deep submicron processor architectures.
                 Software-only error detection approaches that exploit
                 program invariants for silent error detection incur
                 large execution overheads and are unreliable as state
                 can be corrupted after invariant checkpoints. In this
                 article, we explore the use of configurable hardware
                 structures for the continuous evaluation of
                 high-level program invariants at the assembly level.
                 We evaluate the resource requirements and performance
                 of the proposed predicate-evaluation hardware
                 structures when integrated with a 32-bit MIPS soft
                 core on a contemporary reconfigurable hardware
                 device. The results, for a small set of kernel codes,
                 reveal that these hardware structures require a very
                 small number of hardware resources with negligible
                 impact on the processor core that they are integrated
                 in.
Moreover, the amount of resources is fairly insensitive to the
                 complexity of the invariants, thus making the
                 proposed structures an attractive alternative to
                 software-only predicate checking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Scicluna:2015:AMF,
  author =       "Neil Scicluna and Christos-Savvas Bouganis",
  title =        "{ARC 2014}: a Multidimensional {FPGA}-Based Parallel
                 {DBSCAN} Architecture",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2724722",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Clustering large numbers of data points is a very
                 computationally demanding task that often needs to be
                 accelerated in order to be useful in practical
                 applications. This work focuses on the Density-Based
                 Spatial Clustering of Applications with Noise
                 (DBSCAN) algorithm, which is one of the
                 state-of-the-art clustering algorithms, and targets
                 its acceleration using an FPGA device. The article
                 presents an optimized, scalable, and parameterizable
                 architecture that takes advantage of the internal
                 memory structure of modern FPGAs in order to deliver
                 a high-performance clustering system. Post-synthesis
                 simulation results show that the developed system can
                 obtain mean speedups of 31$ \times $ in real-world
                 tests and 202$ \times $ in synthetic tests when
                 compared to state-of-the-art software counterparts
                 running on a quad-core 3.4GHz Intel i7-2600k.
                 Additionally, this implementation is also capable of
                 clustering data with any number of dimensions without
                 impacting the performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sasdrich:2015:ICS,
  author =       "Pascal Sasdrich and Tim G{\"u}neysu",
  title =        "Implementing {Curve25519} for Side-Channel--Protected
                 Elliptic Curve Cryptography",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700834",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "For security-critical embedded applications Elliptic
                 Curve Cryptography (ECC) has become the predominant
                 cryptographic system for efficient key agreement and
                 digital signatures. However, ECC still involves
                 complex modular arithmetic that is a particular
                 burden for small processors. In this context,
                 Bernstein proposed the highly efficient ECC instance
                 Curve25519 that particularly enables efficient
                 software implementations at a security level
                 comparable to AES-128 with inherent resistance to
                 simple power analysis (SPA) and timing attacks. In
                 this work, we show that Curve25519 is likewise
                 competitive on FPGAs even when countermeasures to
                 thwart side-channel power analysis are included. Our
                 basic multicore DSP-based architectures achieves a
                 maximal performance of more than 32,000 point
                 multiplications per second on a Xilinx Zynq 7020
                 FPGA. Including a mix of side-channel countermeasures
                 to impede simple and differential power analysis, we
                 still achieve more than 27,500 point multiplications
                 per second with a moderate increase in logic
                 resources.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2015:EAR,
  author =       "Jianfeng Zhang and Paul Chow and Hengzhu Liu",
  title =        "An Enhanced Adaptive Recoding Rotation {CORDIC}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2812813",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The Conventional Coordinate Rotation Digital Computer
                 (CORDIC) algorithm has been widely used in many
                 applications, particularly in Direct Digital
                 Frequency Synthesizers (DDS) and Fast Fourier
                 Transforms (FFT). However, CORDIC is constrained by
                 the excessive number of iterations, angle data path,
                 and scaling factor compensation. In this article, an
                 enhanced adaptive recoding CORDIC (EARC) is proposed.
                 It uses the enhanced adaptive recoding method to
                 reduce the required iterations and adopts the
                 trigonometric transformation scheme to scale up the
                 rotation angles. Computing sine and cosine is used
                 first to compare the core functionality of EARC with
                 basic CORDIC; then a 16-bit DDS and a 1,024-point FFT
                 based on EARC are evaluated to demonstrate the
                 benefits of EARC in larger applications. All the
                 proposed architectures are validated on a Virtex 5
                 FPGA development platform. Compared with a commercial
                 implementation of CORDIC, EARC requires 33.3\% less
                 hardware resources, provides a twofold speedup,
                 dissipates 70.4\% less power, and improves accuracy
                 in terms of the Bit Error Position (BEP). Compared to
                 the state-of-the-art Hybrid CORDIC, EARC reduces
                 latency by 11.1\% and consumes 17\% less power.
                 Compared with a commercial implementation of DDS, the
                 dissipated power of the proposed DDS is reduced by
                 27.2\%. The proposed DDS improves Spurious-Free
                 Dynamic Range (SFDR) by nearly 7 dBc and dissipates
                 21.8\% less power when compared with a recently
                 published DDS circuit. The FFT based on EARC
                 dissipates a factor of 2.05 less power than the
                 commercial FFT even when choosing the 100\% toggle
                 rate for the FFT based on EARC and the 12.5\% toggle
                 rate for the commercial FFT. Compared with a recently
                 published FFT, the FFT based on EARC improves
                 Signal-to-Noise Ratio (SNR) by 8.9 dB and consumes
                 7.78\% less power.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Goehringer:2015:GEA,
  author =       "Diana Goehringer and Marco D. Santambrogio and
                 Jo{\~a}o M. P. Cardoso and Koen Bertels",
  title =        "Guest Editorial: {ARC 2014}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2831431",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Heyse:2015:IRL,
  author =       "Karel Heyse and Jente Basteleus and Brahim {Al
                 Farisi} and Dirk Stroobandt and Oliver Kadlcek and
                 Oliver Pell",
  title =        "On the Impact of Replacing Low-Speed Configuration
                 Buses on {FPGAs} with the Chip's Internal
                 Configuration Infrastructure",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700835",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "It is common for large hardware designs to have a
                 number of registers or memories whose contents have
                 to be changed very seldom (e.g., only at startup).
                 The conventional way of accessing these memories is
                 through a low-speed memory bus. This bus uses
                 valuable hardware resources, introduces long global
                 connections, and contributes to routing congestion.
                 Hence, it has an impact on the overall design even
                 though it is only rarely used. A Field-Programmable
                 Gate Array (FPGA) already contains a global
                 communication mechanism in the form of its
                 configuration infrastructure. In this article, we
                 evaluate the use of the configuration infrastructure
                 as a replacement for a low-speed memory bus on the
                 Maxeler HPC platform. We find that by removing the
                 conventional low-speed memory bus, the maximum clock
                 frequency of some applications can be improved by
                 8\%. Improvements by 25\% and more are also
                 attainable, but constraints of the Xilinx
                 reconfiguration infrastructure prevent fully
                 exploiting these benefits at the moment. We present a
                 number of possible changes to the Xilinx
                 reconfiguration infrastructure and tools that would
                 solve this and make these results more widely
                 applicable.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Duarte:2015:ACK,
  author =       "Rui Policarpo Duarte and Christos-Savvas Bouganis",
  title =        "{ARC 2014} Over-Clocking {KLT} Designs on {FPGAs}
                 under Process, Voltage, and Temperature Variation",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "7:1--7:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818380",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Karhunen-Loeve Transformation is a widely used
                 algorithm in signal processing that often implemented
                 with high-throughput requisites. This work presents a
                 novel methodology to optimise KLT designs on FPGAs
                 that outperform typical design methodologies, through
                 a prior characterisation of the arithmetic units in
                 the datapath of the circuit under various operating
                 conditions. Limited by the ever-increasing process
                 variation, the delay models available in synthesis
                 tools are no longer suitable for extreme performance
                 optimisation of designs, and as they are generic,
                 they need to consider the worst-case performance for
                 a given fabrication process. Hence, they heavily
                 penalise the maximum possible achieved performance of
                 a design by leaving safety margin. This work presents
                 a novel unified optimisation framework which
                 contemplates a prior characterisation of the embedded
                 multipliers on the target FPGA device under process,
                 voltage, and temperature variation.
The proposed framework allows a design space exploration leading
                 to designs without any latency overheads that achieve
                 high throughput while producing less errors than
                 typical methodologies, operating with the same
                 throughput. Experimental results demonstrate that the
                 proposed methodology outperforms the typical
                 implementation in three real-life design strategies:
                 high performance, low power, and temperature
                 variation; and it produced circuit designs that
                 performed up to 18dB better when over-clocked.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Bai:2015:ATF,
  author =       "Yuhui Bai and Syed Zahid Ahmed and Bertrand Granado",
  title =        "{ARC 2014}: Towards a Fast {FPGA} Implementation of a
                 Heap-Based Priority Queue for Image Coding Using a
                 Parallel Index-Aware Tree",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "8:1--8:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2766454",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The embedded image processing systems like smartphones
                 and digital cameras have tight limits on storage,
                 computation power, network connectivity, and battery
                 usage. These limitations make it important to ensure
                 efficient image coding. In the article, we present a
                 novel heap-based priority queue structure employed by
                 an Adaptive Scanning of Wavelet Data scheme (ASWD)
                 targeting an embedded platform. ASWD is a context
                 modeling block implemented via priority queues in a
                 wavelet-based image coder to reorganize the wavelet
                 coefficients into locally stationary sequences. The
                 architecture we propose exploits efficient use of
                 FPGA's on-chip dual-port memories in an adaptive
                 manner. Innovations of index-aware system linked to
                 each element in the queue makes the location of queue
                 element traceable in the heap as per the requirements
                 of the ASWD algorithm. Moreover, use of 4-port
                 memories along with intelligent data concatenation of
                 queue elements yielded in a cost effective enhanced
                 memory access. The memory ports are adaptively
                 assigned to different units during different
                 processing phases in a manner to optimally take
                 advantage of memory access required by that phase.
                 The architectural innovations can also be exploited
                 in other applications that require efficient hardware
                 implementations of generic priority queue or
                 classical sorting applications which sort into the
                 index. We designed and validated the hardware on an
                 Altera's Stratix IV FPGA as an IP accelerator in a
                 Nios II processor based System on Chip. We show that
                 our architecture at 150MHz can provide 45X speedup
                 compared to an embedded ARM Cortex-A9 processor at
                 666MHz targeting the throughput of 10MB/s.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2016:CBE,
  author =       "Jianfeng Zhang and Paul Chow and Hengzhu Liu",
  title =        "{CORDIC}-Based Enhanced Systolic Array Architecture
                 for {$ Q R $} Decomposition",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2827700",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multiple input multiple output (MIMO) with orthogonal
                 frequency division multiplexing (OFDM) systems
                 typically use orthogonal-triangular (QR)
                 decomposition. In this article, we present an
                 enhanced systolic array architecture to realize QR
                 decomposition based on the Givens rotation (GR)
                 method for a 4 $ \times $ 4 real matrix. The
                 coordinate rotation digital computer (CORDIC)
                 algorithm is adopted and modified to speed up and
                 simplify the process of GR. To verify the function
                 and evaluate the performance, the proposed
                 architectures are validated on a Virtex 5 FPGA
                 development platform. Compared to a commercial
                 implementation of vectoring CORDIC, the enhanced
                 vectoring CORDIC is presented that uses 37.7\% less
                 hardware resources, dissipates 71.6\% less power, and
                 provides a 1.8 times speedup while maintaining the
                 same computation accuracy. The enhanced QR systolic
                 array architecture based on the enhanced vectoring
                 CORDIC saves 24.5\% in power dissipation, provides a
                 factor of 1.5-fold improvement in throughput, and the
                 hardware efficiency is improved 1.45-fold with no
                 accuracy penalty when compared to our previously
                 proposed QR systolic array architecture.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Winterstein:2016:SLH,
  author =       "Felix J. Winterstein and Samuel R. Bayliss and George
                 A. Constantinides",
  title =        "Separation Logic for High-Level Synthesis",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2836169",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "High-Level Synthesis (HLS) promises a significant
                 shortening of the FPGA design cycle by raising the
                 abstraction level of the design entry to high-level
                 languages such as C/C++. However, applications using
                 dynamic, pointer-based data structures and dynamic
                 memory allocation remain difficult to implement well,
                 yet such constructs are widely used in software.
                 Automated optimizations that leverage the memory
                 bandwidth of FPGAs by distributing the application
                 data over separate banks of on-chip memory are often
                 ineffective in the presence of dynamic data
                 structures due to the lack of an automated analysis
                 of pointer-based memory accesses. In this work, we
                 take a step toward closing this gap. We present a
                 static analysis for pointer-manipulating programs
                 that automatically splits heap-allocated data
                 structures into disjoint, independent regions. The
                 analysis leverages recent advances in separation
                 logic, a theoretical framework for reasoning about
                 heap-allocated data that has been successfully
                 applied in recent software verification tools. Our
                 algorithm focuses on dynamic data structures accessed
                 in loops and is accompanied by automated
                 source-to-source transformations that enable
                 automatic loop parallelization and memory
                 partitioning by off-the-shelf HLS tools. We
                 demonstrate the successful loop parallelization and
                 memory partitioning by our tool flow using three
                 real-life applications that build, traverse, update,
                 and dispose of dynamically allocated data structures.
                 Our case studies, comparing the automatically
                 parallelized to the direct HLS implementations, show
                 an average latency reduction by a factor of
                 2 $ \times $ across our benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Xu:2016:CGA,
  author =       "Jinwei Xu and Jingfei Jiang and Yong Dou and Xiaolong
                 Shen and Zhiqiang Liu",
  title =        "Coarse-Grained Architecture for Fingerprint Matching",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2791296",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Fingerprint matching is a key procedure in fingerprint
                 identification applications. The minutiae-based
                 fingerprint matching algorithm is one of the most
                 typical algorithms achieving a reasonably correct
                 recognition rate. This study proposes a
                 coarse-grained parallel architecture called
                 fingerprint matching core (FMC) to accelerate
                 fingerprint matching. The proposed architecture has a
                 two-level parallel structure (i.e., parallel among
                 groups (PAG) and parallel in group (PIG)). A
                 multirequest controller is added to the PAG structure
                 to obtain a concurrent operation of the multiple
                 processing element group (PEG). The DDR3 controller
                 is used in the PIG structure to read eight minutiae
                 from eight different fingerprints and realize the
                 simultaneous computation of the eight PEs.
The whole system is implemented on a Xilinx FPGA board with a
                 Virtex VII XC7VX485T chip. The 16-PEG FMC achieves a
                 throughput of about 9.63 million fingerprint pairs
                 per second, which is larger than that achieved on a
                 Tesla K20c platform. The software execution times are
                 also measured on the 2.93GHz Intel Xeon 5670, 2.3GHz
                 AMD Opteron(tm) Processor 6376, and Tesla K20c
                 platforms. The Intel Xeon 5670 has two processors
                 with 12 cores, and the AMD Opteron(tm) Processor 6376
                 has two processors with 16 cores. Moreover, the
                 throughput is about 31 times that achieved on a
                 2.93GHz Intel Xeon 5670 single core.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zaidi:2016:VSF,
  author =       "Ali Mustafa Zaidi and David Greaves",
  title =        "Value State Flow Graph: a Dataflow Compiler {IR} for
                 Accelerating Control-Intensive Code in Spatial
                 Hardware",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2807702",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although custom (and reconfigurable) computing can
                 provide orders-of-magnitude improvements in energy
                 efficiency and performance for many numeric,
                 data-parallel applications, performance on
                 nonnumeric, sequential code is often worse than
                 conventional superscalar processors. This work
                 attempts to improve sequential performance in custom
                 hardware by (a) switching from a statically scheduled
                 to a dynamically scheduled (dataflow) execution model
                 and (b) developing a new compiler IR for high-level
                 synthesis---the value state flow graph
                 (VSFG)---that enables aggressive exposition of ILP
                 even in the presence of complex control flow.
                 Compared to existing control-data flow graph
                 (CDFG)-based IRs, the VSFG exposes more
                 instruction-level parallelism from control-intensive
                 sequential code by exploiting aggressive speculation,
                 enabling control dependence analysis, as well as
                 execution along multiple flows of control. This new
                 IR is directly implemented as a static-dataflow graph
                 in hardware by our prototype high-level synthesis
                 tool chain and shows an average speedup of
                 1.13$ \times $ over equivalent hardware generated
                 using LegUp, an existing CDFG-based HLS tool.
                 Furthermore, the VSFG allows us to further trade area
                 and energy for performance through loop unrolling,
                 increasing the average speedup to 1.55$ \times $,
                 with a peak speedup of 4.05$ \times $. Our VSFG-based
                 hardware approaches the sequential cycle counts of an
                 Intel Nehalem Core i7 processor while consuming only
                 0.25$ \times $ the energy of an in-order Altera Nios
                 II f processor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Raitza:2016:RRN,
  author =       "Michael Raitza and Markus Vogt and Christian
                 Hochberger and Thilo Pionteck",
  title =        "{RAW 2014}: Random Number Generators on {FPGAs}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2807699",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Random numbers are important ingredients in a number
                 of applications. Especially in a security context,
                 they must be well distributed and unpredictable. We
                 investigate the practical use of random number
                 generators (RNGs) that are built from digital
                 elements found in FPGAs. For this, we implement
                 different types of ring oscillators (ROs) and memory
                 collision-based circuits on FPGAs from major vendors.
                 Implementing RNGs on the same device as the rest of
                 the system benefits an overall reduction of
                 vulnerability to attacks and wire tapping.
                 Nevertheless, we investigate different attacks by
                 tampering with power supply, chip temperature, and by
                 exposition to strong magnetic fields and X-radiation.
                 We also consider their usability as massively
                 deployed components, whose functionality cannot be
                 tested individually anymore, by conducting a
                 technology invariance experiment. Our experiments
                 show that BlockRAM-based RNGs cannot be considered as
                 a suitable entropy source. We further show that
                 RO-based RNGs work reliably under a wide range of
                 operating conditions. While magnetic fields and
                 X-rays did not induce any notable change, voltage and
                 temperature variations caused an increase in
                 propagation delays within the circuits.
We show how reliable RNGs can be constructed and deployed on FPGAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Attia:2016:RAD, author = "Osama G. Attia and Kevin R. Townsend and Phillip H. Jones and Joseph Zambreno", title = "A Reconfigurable Architecture for the Detection of Strongly Connected Components", journal = j-TRETS, volume = "9", number = "2", pages = "16:1--16:??", month = feb, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2807700", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 22 16:19:57 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The Strongly Connected Components (SCCs) detection algorithm serves as a keystone for many graph analysis applications. The SCC execution time for large-scale graphs, as with many other graph algorithms, is dominated by memory latency. In this article, we investigate the design of a parallel hardware architecture for the detection of SCCs in directed graphs. We propose a design methodology that alleviates memory latency and problems with irregular memory access. The design is composed of 16 processing elements dedicated to parallel Breadth-First Search (BFS) and eight processing elements dedicated to finding intersection in parallel. Processing elements are organized to reuse resources and utilize memory bandwidth efficiently. We demonstrate a prototype of our design using the Convey HC-2 system, a commercial high-performance reconfigurable computing coprocessor. Our experimental results show a speedup of as much as 17$ \times $ for detecting SCCs in large-scale graphs when compared to a conventional sequential software implementation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kapre:2016:OSV, author = "Nachiket Kapre", title = "Optimizing Soft Vector Processing in {FPGA}-Based Embedded Systems", journal = j-TRETS, volume = "9", number = "3", pages = "17:1--17:??", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2912884", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jul 14 16:35:43 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Soft vector processors can augment and extend the capability of FPGA-based embedded systems-on-chip such as the Xilinx Zynq. However, configuring and optimizing the soft processor for best performance is hard. We must consider architectural parameters such as precision, vector lane count, vector length, chunk size, and DMA scheduling to ensure efficient execution of code on the soft vector processing platform. To simplify the design process, we develop a compiler framework and an autotuning runtime that splits the optimization into a combination of static and dynamic passes that map data-parallel computations to the soft processor. We compare and contrast implementations running on the scalar ARM processor, the embedded NEON hard vector engine, and low-level streaming Verilog designs with the VectorBlox MXP soft vector processor. Across a range of data-parallel benchmarks, we show that the MXP soft vector processor can outperform other organizations by up to $ 4 \times $ while saving $ \approx 10 \% $ dynamic power. Our compilation and runtime framework is also able to outperform the gcc NEON vectorizer under certain conditions by explicit generation of NEON intrinsics and performance tuning of the autogenerated data-parallel code. 
When constrained by IO bandwidth, soft vector processors are even
                 competitive with spatial Verilog implementations of
                 computation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Dehon:2016:ISI,
  author =       "Andr{\'e} Dehon and Derek Chiou",
  title =        "Introduction to Special Issue on Reconfigurable
                 Components with Source Code",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2907949",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jul 14 16:35:43 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fang:2016:OSV,
  author =       "Xin Fang and Miriam Leeser",
  title =        "Open-Source Variable-Precision Floating-Point Library
                 for Major Commercial {FPGAs}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "20:1--20:17",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851507",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Feb 8 10:53:20 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/2851507",
  abstract =     "There is increased interest in implementing
                 floating-point designs for different precisions that
                 take advantage of the flexibility offered by
                 Field-Programmable Gate Arrays (FPGAs). In this
                 article, we present updates to the Variable-precision
                 FLOATing Point Library (VFLOAT) developed at
                 Northeastern University and highlight recent
                 improvements in implementations for implementing
                 reciprocal, division, and square root components that
                 scale to double precision for FPGAs from the two major
                 vendors: Altera and Xilinx. Our library is open source
                 and flexible and provides the user with many options.
                 A designer has many tradeoffs to consider including
                 clock frequency, total latency, and resource usage as
                 well as target architecture. We compare the generated
                 cores to those produced by each vendor and to another
                 popular open-source tool: FloPoCo. VFLOAT has the
                 advantage of not tying the user's design to a specific
                 target architecture and of providing the maximum
                 flexibility for all options including clock frequency
                 and latency compared to other alternatives. Our
                 results show that variable-precision as well as
                 double-precision designs can easily be accommodated
                 and the resulting components are competitive and in
                 many cases superior to the alternatives.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wilson:2016:UAA,
  author =       "David Wilson and Greg Stitt",
  title =        "The Unified Accumulator Architecture: a Configurable,
                 Portable, and Extensible Floating-Point Accumulator",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809432",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jul 14 16:35:43 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Applications accelerated by field-programmable gate
                 arrays (FPGAs) often require pipelined floating-point
                 accumulators with a variety of different trade-offs.
                 Although previous work has introduced numerous
                 floating-point accumulation architectures, few cores
                 are available for public use, which forces designers
                 to use fixed-point implementations or vendor-provided
                 cores that are not portable and are often not
                 optimized for the desired set of trade-offs. In this
                 article, we combine and extend previous floating-point
                 accumulator architectures into a configurable,
                 open-source core, referred to as the unified
                 accumulator architecture (UAA), which enables
                 designers to choose between different trade-offs for
                 different applications. UAA is portable across FPGAs
                 and allows designers to specialize the underlying
                 adder core to take advantage of device-specific
                 optimizations. By providing an extensible, open-source
                 implementation, we hope for the research community to
                 extend the provided core with new architectures and
                 optimizations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Abdelhadi:2016:MSM,
  author =       "Ameer M. S. Abdelhadi and Guy G. F. Lemieux",
  title =        "Modular Switched Multiported {SRAM}-Based Memories",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "22:1--22:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851506",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jul 14 16:35:43 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multiported RAMs are essential for high-performance
                 parallel computation systems. VLIW and vector
                 processors, CGRAs, DSPs, CMPs, and other processing
                 systems often rely upon multiported memories for
                 parallel access. Although memories with a large number
                 of read and write ports are important, their high
                 implementation cost means that they are used
                 sparingly. As a result, FPGA vendors only provide
                 dual-ported block RAMs (BRAMs) to handle the majority
                 of usage patterns. Furthermore, recent attempts to
                 create FPGA-based multiported memories suffer from low
                 storage utilization. Whereas most approaches provide
                 simple unidirectional ports with a fixed read or
                 write, others propose true bidirectional ports where
                 each port dynamically switches read and write. True
                 RAM ports are useful for systems with transceivers and
                 provide high RAM flexibility; however, this
                 flexibility incurs high BRAM consumption. In this
                 article, a novel, modular, and BRAM-based switched
                 multiported RAM architecture is proposed. In addition
                 to unidirectional ports with fixed read/write, this
                 switched architecture allows a group of write ports to
                 switch with another group of read ports dynamically,
                 hence altering the number of active ports. The
                 proposed switched-ports architecture is less flexible
                 than a true-multiported RAM where each port is
                 switched individually. Nevertheless, switched memories
                 can dramatically reduce BRAM consumption compared to
                 true ports for systems with alternating port
                 requirements. Previous live-value-table (LVT) and XOR
                 approaches are merged and optimized into a generalized
                 and modular structure that we call an
                 invalidation-based live-value-table (I-LVT). Like a
                 regular LVT, the I-LVT determines the correct bank to
                 read from, but it differs in how updates to the table
                 are made; the LVT approach requires multiple write
                 ports, often leading to an area-intensive
                 register-based implementation, whereas the XOR
                 approach suffers from excessive storage overhead since
                 wider memories are required to accommodate the XOR-ed
                 data. Two specific I-LVT implementations are proposed
                 and evaluated: binary and thermometer coding. The
                 I-LVT approach is especially suitable for deep
                 memories because the table is implemented only in SRAM
                 cells. The I-LVT method gives higher performance while
                 occupying fewer BRAMs than earlier approaches: for
                 several configurations, BRAM usage is reduced by
                 greater than 44\% and clock speed is improved by
                 greater than 76\%. The I-LVT can be used with fixed
                 ports, true ports, or the proposed switched ports
                 architectures. Formal proofs for the suggested
                 methods, resources consumption analysis, usage
                 guidelines, and analytic comparison to other methods
                 are provided. A fully parameterized Verilog
                 implementation is released as an open source library.
                 The library has been extensively tested using Altera's
                 EDA tools.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Stitt:2016:PSW,
  author =       "Greg Stitt and Eric Schwartz and Patrick Cooke",
  title =        "A Parallel Sliding-Window Generator for
                 High-Performance Digital-Signal Processing on
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "23:1--23:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2800789",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jul 14 16:35:43 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Sliding-window applications, an important class of the
                 digital-signal processing domain, are highly amenable
                 to pipeline parallelism on field-programmable gate
                 arrays (FPGAs). Although memory bandwidth often
                 restricts parallelism for many applications,
                 sliding-window applications can leverage custom
                 buffers, referred to as sliding-window generators,
                 that provide massive input bandwidth that far exceeds
                 the capabilities of external memory. Previous work has
                 introduced a variety of sliding-window generators, but
                 those approaches typically generate at most one window
                 per cycle, which significantly restricts parallelism.
                 In this article, we address this limitation with a
                 parallel sliding-window generator that can generate a
                 configurable number of windows every cycle. Although
                 in practice the number of parallel windows is limited
                 by memory bandwidth, we show that even with common
                 bandwidth limitations, the presented generator enables
                 near-linear speedups up to 16x faster than previous
                 FPGA studies that generate a single window per cycle,
                 which were already in some cases faster than
                 graphics-processing units and microprocessors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ul-Abdin:2016:RCF,
  author =       "Zain Ul-Abdin and Bertil Svensson",
  title =        "A Retargetable Compilation Framework for Heterogeneous
                 Reconfigurable Computing",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "24:1--24:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2843946",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The future trend in microprocessors for the more
                 advanced embedded systems is focusing on massively
                 parallel reconfigurable architectures, consisting of
                 heterogeneous ensembles of hundreds of processing
                 elements communicating over a reconfigurable
                 interconnection network. However, the mastering of
                 low-level microarchitectural details involved in the
                 programming of such massively parallel platforms
                 becomes too cumbersome, which limits their adoption in
                 many applications. Thus, there is a dire need for an
                 approach to produce high-performance scalable
                 implementations that harness the computational
                 resources of the emerging reconfigurable platforms.
                 This article addresses the grand challenge of
                 accessibility of these diverse reconfigurable
                 platforms by suggesting the use of a high-level
                 language, occam-pi, and developing a complete design
                 flow for building, compiling, and generating machine
                 code for heterogeneous coarse-grained hardware. We
                 have evaluated the approach by implementing complex
                 industrial case studies and three common signal
                 processing algorithms. The results of the implemented
                 case studies suggest that the occam-pi language-based
                 approach, because of its well-defined semantics for
                 expressing concurrency and reconfigurability,
                 simplifies the development of applications employing
                 runtime reconfigurable devices. The associated
                 compiler framework ensures portability as well as the
                 performance benefits across heterogeneous platforms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ziener:2016:FBD,
  author =       "Daniel Ziener and Florian Bauer and Andreas Becher and
                 Christopher Dennl and Klaus Meyer-Wegener and Ute
                 Sch{\"u}rfeld and J{\"u}rgen Teich and J{\"o}rg-Stephan
                 Vogt and Helmut Weber",
  title =        "{FPGA}-Based Dynamically Reconfigurable {SQL} Query
                 Processing",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "25:1--25:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845087",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we propose an FPGA-based SQL query
                 processing approach exploiting the capabilities of
                 partial dynamic reconfiguration of modern FPGAs. After
                 the analysis of an incoming query, a query-specific
                 hardware processing unit is generated on the fly and
                 loaded on the FPGA for immediate query execution. For
                 each query, a specialized hardware accelerator
                 pipeline is composed and configured on the FPGA from a
                 set of presynthesized hardware modules. These
                 partially reconfigurable hardware modules are gathered
                 in a library covering all major SQL operations like
                 restrictions and aggregations, as well as more complex
                 operations such as joins and sorts. Moreover, this
                 holistic query processing approach in hardware
                 supports different data processing strategies
                 including row- as column-wise data processing in order
                 to optimize data communication and processing. This
                 article gives an overview of the proposed query
                 processing methodology and the corresponding library
                 of modules. Additionally, a performance analysis is
                 introduced that is able to estimate the processing
                 time of a query for different processing strategies
                 and different communication and processing
                 architecture configurations. With the help of this
                 performance analysis, architectural bottlenecks may be
                 exposed and future optimized architectures, besides
                 the two prototypes presented here, may be
                 determined.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Matthews:2016:SMM,
  author =       "Eric Matthews and Lesley Shannon and Alexandra
                 Fedorova",
  title =        "Shared Memory Multicore {MicroBlaze} System with {SMP}
                 {Linux} Support",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870638",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this work, we present PolyBlaze, a scalable and
                 configurable multicore platform for FPGA-based
                 embedded systems and systems research. PolyBlaze is an
                 extension of the MicroBlaze soft processor, leveraging
                 the configurability of the MicroBlaze and bringing it
                 into the multicore era with Linux Symmetric
                 Multi-Processor (SMP) support.
This work details the hardware modifications required for the
                 MicroBlaze processor and its software stack to enable
                 fully validated SMP operations, including atomic
                 operation support, shared interrupts and timers, and
                 exception handling. New in this work, we present a
                 scalable and flexible memory hierarchy optimized for
                 Field Programmable Gate Arrays (FPGAs), which manages
                 atomic operations and provides support for future
                 flexible memory hierarchies and heterogeneous systems.
                 Also new is an in-depth analysis of key performance
                 characteristics, including memory bandwidth, latency,
                 and resource usage. For all system configurations,
                 bandwidth is found to scale linearly with the addition
                 of processor cores until the memory interface is
                 saturated. Additionally, average memory latency
                 remains constant until the memory interface is
                 saturated; after which, it scales linearly with each
                 additional processor core.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Yu:2016:OAH,
  author =       "Ting Yu and Chris Bradley and Oliver Sinnen",
  title =        "{ODoST}: Automatic Hardware Acceleration for
                 Biomedical Model Integration",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870639",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Dynamic biomedical systems are mathematically
                 described by Ordinary Differential Equations (ODEs)
                 and their solution is often one of the most
                 computationally intensive parts in biomedical
                 simulations. With high inherent parallelism, hardware
                 acceleration based on Field-Programmable Gate Arrays
                 (FPGAs) has great potential to increase the
                 computational performance of the model simulations,
                 while being very power-efficient. However, the manual
                 hardware implementation is complex and time consuming.
                 The advantages of FPGA designs can only be realised if
                 there is a general solution to automate the process.
                 In this article, we propose a domain-specific
                 high-level synthesis tool called ODoST that
                 automatically generates an FPGA-based Hardware
                 Accelerator Module (HAM) from a high-level
                 description. In this direct approach, ODE equations
                 are directly mapped to processing pipelines without
                 any intermediate architecture layer of processing
                 elements. We evaluate the generated HAMs on real
                 hardware based on their resource usage, processing
                 speed, and power consumption, and compare them with
                 CPUs and a GPU. The results show that FPGA
                 implementations can achieve 15.3 times more speedup
                 compared to a single core CPU solution and perform
                 similarly to an auto-generated GPU solution, while the
                 FPGA implementations can achieve 14.5 times more power
                 efficiency than the CPU and 3.1 times compared to the
                 optimised GPU solution. Improved speedups are
                 foreseeable based on further optimisations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chen:2016:I,
  author =       "Deming Chen",
  title =        "Introduction",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2955103",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wegley:2016:ASD,
  author =       "Evan Wegley and Yanhua Yi and Qinhai Zhang",
  title =        "Application of Specific Delay Window Routing for
                 Timing Optimization in {FPGA} Designs",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892640",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In addition to optimizing for long-path timing and
                 routability, commercial FPGA routing engines must also
                 optimize for various timing constraints, enabling
                 users to fine tune their designs. These timing
                 constraints involve both long- and short-path timing
                 requirements. The intricacies of commercial FPGA
                 architectures add difficulty to the problem of
                 supporting such constraints. In this work, we
                 introduce specific delay window routing as a general
                 method for optimization during the routing stage of
                 the FPGA design flow, which can be applied to various
                 timing constraints constituting both long- and
                 short-path requirements.
Furthermore, we propose a key adjustment to standard FPGA routing
                 technology for the purposes of specific delay window
                 routing. By using dual-wave expansion instead of
                 traditional single-wave expansion, we solve the
                 critical issue of inaccurate delay estimation in our
                 wave search, which would otherwise make routing
                 according to a specific delay window difficult. Our
                 results show that this dual-wave method can support
                 stricter timing constraints than the standard
                 single-wave method. For a suite of designs with
                 constraints requiring connections to meet a target
                 delay within 250ps, our dual-wave method could satisfy
                 the requirement for all designs, whereas the
                 single-wave method failed for more than two thirds of
                 the designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kadric:2016:IPM,
  author =       "Edin Kadric and David Lakata and Andr{\'e} Dehon",
  title =        "Impact of Parallelism and Memory Architecture on
                 {FPGA} Communication Energy",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857057",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The energy in FPGA computations is dominated by data
                 communication energy, either in the form of memory
                 references or data movement on interconnect. In this
                 article, we explore how to use data placement and
                 parallelism to reduce communication energy. We show
                 that parallelism can reduce energy and that the
                 optimal level of parallelism increases with the
                 problem size. We further explore how FPGA memory
                 architecture (memory block size(s), memory banking,
                 and spacing between memory banks) can impact
                 communication energy, and determine how to organize
                 the memory architecture to guarantee that the energy
                 overhead compared to the optimally matched
                 architecture for the design is never more than 60\%.
                 We specifically show that an architecture with 32 bit
                 wide, 16Kb internally banked memories placed every 8
                 columns of 10 4-LUT logic blocks is within 61\% of the
                 optimally matched architecture across the VTR 7
                 benchmark set and a set of parallelism-tunable
                 benchmarks. Without internal banking, the worst-case
                 overhead is 98\%, achieved with an architecture with
                 32 bit wide, 8Kb memories placed every 9 columns,
                 roughly comparable to the memory organization on the
                 Cyclone V (where memories are placed about every 10
                 columns). Monolithic 32 bit wide, 16Kb memories placed
                 every 10 columns (comparable to 18Kb and 20Kb memories
                 used in Virtex 4 and Stratix V FPGAs) have a 180\%
                 worst-case energy overhead. Furthermore, we show
                 practical cases where designs mapped for optimal
                 parallelism use $ 4.7 \times $ less energy than
                 designs using a single processing element.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rodionov:2016:FGI,
  author =       "Alex Rodionov and David Biancolin and Jonathan Rose",
  title =        "Fine-Grained Interconnect Synthesis",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "31:1--31:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892641",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "One of the key challenges for the FPGA industry going
                 forward is to make the task of designing hardware
                 easier. A significant portion of that design task is
                 the creation of the interconnect pathways between
                 functional structures. We present a synthesis tool
                 that automates this process and focuses on the
                 interconnect needs in the fine-grained (sub-IP-block)
                 design space. Here there are several issues that prior
                 research and tools do not address well: the need to
                 have fixed, deterministic latency between
                 communicating units (to enable high-performance local
                 communication without the area overheads of latency
                 insensitivity), and the ability to avoid generating
                 unnecessary arbitration hardware when the application
                 design can avoid it. Using a design example, our tool
                 generates interconnect that requires 69\% fewer lines
                 of specification code than a handwritten Verilog
                 implementation, which is a 32\% overall reduction for
                 the entire application. The resulting system, while
                 requiring 6\% more total functional and interconnect
                 area, achieves the same performance. We also show a
                 quantitative and qualitative advantages against an
                 existing commercial interconnect synthesis tool, over
                 which we achieve a 25\% performance advantage and
                 15\%/57\% logic/memory area savings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans.
Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wulf:2016:FEO,
  author =       "Nicholas Wulf and Alan D. George and Ann Gordon-Ross",
  title =        "A Framework for Evaluating and Optimizing {FPGA}-Based
                 {SoCs} for Aerospace Computing",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2888400",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "On-board processing systems are often deployed in
                 harsh aerospace environments and must therefore adhere
                 to stringent constraints such as low power, small
                 size, and high dependability in the presence of
                 faults. Field-programmable gate arrays (FPGAs) are
                 often an attractive option for designers seeking
                 low-power, high-performance devices. However, unlike
                 nonreconfigurable devices, radiation effects can alter
                 an FPGA's functionality instead of just the device's
                 data, requiring designers to consider fault-tolerant
                 strategies to mitigate these effects. In this article,
                 we present a framework to ease these system design
                 challenges and aid designers in considering a broad
                 range of devices and fault-tolerant strategies for
                 on-board processing, highlighting the most promising
                 options and tradeoffs early in the design process.
                 This article focuses on the power, dependability, and
                 lifetime evaluation metrics, which our framework
                 calculates and leverages to evaluate the effectiveness
                 of varying system-on-chip (SoC) designs. Finally, we
                 use our framework to evaluate SoC designs for a case
                 study on a hyperspectral-imaging (HSI) mission to
                 demonstrate our framework's ability to identify
                 efficient and effective SoC designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Richardson:2016:AFR,
  author =       "Justin Richardson and Alan George and Kevin Cheng and
                 Herman Lam",
  title =        "Analysis of Fixed, Reconfigurable, and Hybrid Devices
                 with Computational, Memory, {I/O}, \&
                 Realizable-Utilization Metrics",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2888401",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The modern processor landscape is a varied and diverse
                 community. As such, developers need a way to quickly
                 and fairly compare various devices for use with
                 particular applications. This article expands the
                 authors' previously published computational-density
                 metrics and presents an analysis of a new generation
                 of various device architectures, including CPU, DSP,
                 FPGA, GPU, and hybrid architectures. Also, new memory
                 metrics are added to expand the existing suite of
                 metrics to characterize the memory resources on
                 various processing devices. Finally, a new relational
                 metric, realizable utilization (RU), is introduced,
                 which quantifies the fraction of the computational
                 density metric that an application achieves within an
                 individual implementation. The RU metric can be used
                 to provide valuable feedback to application developers
                 and architecture designers by highlighting the upper
                 bound on specific application optimization and
                 providing a quantifiable measure of theoretical and
                 realizable performance. Overall, the analysis in this
                 article quantifies the performance tradeoffs among the
                 architectures studied, the memory characteristics of
                 different device types, and the efficiency of device
                 architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chao:2016:DTM,
  author =       "Hung-Lin Chao and Sheng-Ya Tung and Pao-Ann Hsiung",
  title =        "Dynamic Task Mapping with Congestion Speculation for
                 Reconfigurable Network-on-Chip",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892633",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Network-on-Chip (NoC) has been proposed as a promising
                 communication architecture to replace the dedicated
                 interconnections and shared buses for future embedded
                 system platforms. In such a parallel platform, mapping
                 application tasks to the NoC is a key issue because it
                 affects throughput significantly due to the problem of
                 communication congestion. Increased communication
                 latency, low system performance, and low resource
                 utilization are some side-effects of a bad mapping.
                 Current mapping algorithms either do not consider link
                 utilizations or consider only the current
                 utilizations. Besides, to design an efficient NoC
                 platform, mapping task to computation nodes and
                 scheduling communication should be taken into
                 consideration.
In this work, we propose an efficient algorithm for dynamic task mapping with congestion speculation (DTMCS) that not only includes the conventional application mapping, but also further considers future traffic patterns based on the link utilization. The proposed algorithm can reduce overall congestion, instead of only improving the current packet blocking situation. Our experiment results have demonstrated that compared to the state-of-the-art congestion-aware Path Load algorithm, the proposed DTMCS algorithm can reduce up to 40.5\% of average communication latency, while the maximal communication latency can be reduced by up to 67.7\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{LeGal:2016:FSM, author = "Bertrand {Le Gal} and Y{\'e}rom-David Bromberg and Laurent R{\'e}veill{\`e}re and Jigar Solanki", title = "A Flexible {SoC} and Its Methodology for Parser-Based Applications", journal = j-TRETS, volume = "10", number = "1", pages = "4:1--4:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2939379", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Embedded systems are being increasingly network interconnected. They are required to interact with their environment through text-based protocol messages. Parsing such messages is control dominated. The work presented in this article attempts to accelerate message parsers using a codesign-based approach. We propose a generic architecture associated with an automated design methodology that enables SoC/SoPC system generation from high-level specifications of message protocols. 
Experimental results obtained on a Xilinx ML605 board show acceleration factors ranging from four to 11. Both static and dynamic reconfigurations of coprocessors are discussed and then evaluated so as to reduce the system hardware complexity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Pang:2016:MKR, author = "Yeyong Pang and Shaojun Wang and Yu Peng and Xiyuan Peng and Nicholas J. Fraser and Philip H. W. Leong", title = "A Microcoded Kernel Recursive Least Squares Processor Using {FPGA} Technology", journal = j-TRETS, volume = "10", number = "1", pages = "5:1--5:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2950061", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Kernel methods utilize linear methods in a nonlinear feature space and combine the advantages of both. Online kernel methods, such as kernel recursive least squares (KRLS) and kernel normalized least mean squares (KNLMS), perform nonlinear regression in a recursive manner, with similar computational requirements to linear techniques. In this article, an architecture for a microcoded kernel method accelerator is described, and high-performance implementations of sliding-window KRLS, fixed-budget KRLS, and KNLMS are presented. The architecture utilizes pipelining and vectorization for performance, and microcoding for reusability. The design can be scaled to allow tradeoffs between capacity, performance, and area. The design is compared with a central processing unit (CPU), digital signal processor (DSP), and Altera OpenCL implementations. 
In different configurations on an Altera Arria 10 device, our SW-KRLS implementation delivers floating-point throughput of approximately 16 GFLOPs, latency of 5.5 $ \mu $ s, and energy consumption of $ 10^{- 4} $ J, these being improvements over a CPU by factors of 12, 17, and 24, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tang:2016:AKM, author = "Qing Y. Tang and Mohammed A. S. Khalid", title = "Acceleration of $k$-Means Algorithm Using {Altera SDK} for {OpenCL}", journal = j-TRETS, volume = "10", number = "1", pages = "6:1--6:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2964910", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A K-means clustering algorithm involves partitioning of data iteratively into k clusters. It is one of the most popular data-mining algorithms [Wu et al. 2007], and is widely used in other applications, such as image processing and machine learning. However, k-means is highly time-consuming when data or cluster size is large. Traditionally, FPGAs have shown great promise for accelerating computationally intensive algorithms, but they are harder to use for acceleration if we rely on traditional HDL-based design methods. The recent introduction of Altera SDK for the OpenCL high-level synthesis tool allows developers to utilize FPGA's potential without long development periods and extensive hardware knowledge. This article presents an optimized implementation of a k-means clustering algorithm on an FPGA using Altera SDK for OpenCL.
Performance and power consumption is measured with various data, cluster, and dimension sizes. When compared to state-of-the-art solutions, this implementation supports larger cluster sizes, offers up to 21x speed over a CPU and is more power efficient than a GPU. Unlike previous implementations, it can deliver consistently high throughput across large or small feature dimensions given reasonable cluster sizes and large enough data size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wong:2016:MCM, author = "Henry Wong and Vaughn Betz and Jonathan Rose", title = "Microarchitecture and Circuits for a {200 MHz} Out-of-Order Soft Processor Memory System", journal = j-TRETS, volume = "10", number = "1", pages = "7:1--7:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2974022", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Although FPGAs have grown in capacity, FPGA-based soft processors have grown very little because of the difficulty of achieving higher performance in exchange for area. Superscalar out-of-order processors promise large performance gains, and the memory subsystem is a key part of such a processor that must help supply increased performance. In this article, we describe and explore microarchitectural and circuit-level tradeoffs in the design of such a memory system. We show the significant instructions-per-cycle wins for providing various levels of out-of-order memory access and memory dependence speculation ($ 1.32 \times $ SPECint2000) and for the addition of a second-level cache (another $ 1.60 \times $ ). 
With careful microarchitecture and circuit design, we also achieve a L1 translation lookaside buffers and cache lookup with 29\% less logic delay than the simpler Nios II/f memory system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rouhani:2016:ART, author = "Bita Darvish Rouhani and Azalia Mirhoseini and Ebrahim M. Songhori and Farinaz Koushanfar", title = "Automated Real-Time Analysis of Streaming Big and Dense Data on Reconfigurable Platforms", journal = j-TRETS, volume = "10", number = "1", pages = "8:1--8:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2974023", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We propose SSketch, a novel automated framework for efficient analysis of dynamic big data with dense (non-sparse) correlation matrices on reconfigurable platforms. SSketch targets streaming applications where each data sample can be processed only once and storage is severely limited. Our framework adaptively learns from the stream of input data and updates a corresponding ensemble of lower-dimensional data structures, a.k.a., a sketch matrix. A new sketching methodology is introduced that tailors the problem of transforming the big data with dense correlations to an ensemble of lower-dimensional subspaces such that it is suitable for hardware-based acceleration performed by reconfigurable hardware. The new method is scalable, while it significantly reduces costly memory interactions and enhances matrix computation performance by leveraging coarse-grained parallelism existing in the dataset. 
SSketch provides an automated optimization methodology for creating the most accurate data sketch for a given set of user-defined constraints, including runtime and power as well as platform constraints such as memory. To facilitate automation, SSketch takes advantage of a Hardware/Software (HW/SW) co-design approach: It provides an Application Programming Interface that can be customized for rapid prototyping of an arbitrary matrix-based data analysis algorithm. Proof-of-concept evaluations on a variety of visual datasets with more than 11 million non-zeros demonstrate up to a 200-fold speedup on our hardware-accelerated realization of SSketch compared to a software-based deployment on a general-purpose processor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bourge:2016:GEC, author = "Alban Bourge and Olivier Muller and Fr{\'e}d{\'e}ric Rousseau", title = "Generating Efficient Context-Switch Capable Circuits through Autonomous Design Flow", journal = j-TRETS, volume = "10", number = "1", pages = "9:1--9:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2996199", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Apr 3 11:34:09 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Commercial off-the-shelf (COTS) Field-Programmable Gate Arrays (FPGAs) are becoming increasingly powerful. In addition to their huge hardware resources, they are also integrated into complete systems on chips (SOCs), e.g., in the latest Xilinx Zynq or Altera Stratix platforms. However, cooperation between FPGAs and their surroundings, and the flexibility of hardware task management could still be improved. 
For instance, mechanisms have yet to be automated to allow multi-user approaches. A reconfigurable resource can be shared between applications or users only if it has a context-switch ability allowing applications to be paused and resumed in response to system demands. Here, we present a high-level synthesis (HLS) design flow producing a context-switch-capable circuit. The design flow manipulates the intermediate representation of an HLS tool to build the context extraction mechanism and to optimize performance for the circuit produced. The method is based on efficient checkpoint selection and insertion of a powerful scan-chain into the initial circuit. This scan-chain can extract flip-flops or memory content. Experiments with the system produced show that it has a low hardware overhead for many benchmark applications, and that the hardware added has a negligible impact on application performance. Comparisons with current standard methods highlight the efficiency of our contributions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cardoso:2017:ISS, author = "Jo{\~a}o M. P. Cardoso and Cristina Silvano", title = "Introduction to the Special Section on {FPL 2015}", journal = j-TRETS, volume = "10", number = "2", pages = "10:1--10:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3041224", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kim:2017:SSC, author = "Jin Hee Kim and Jason H. 
Anderson", title = "Synthesizable Standard Cell {FPGA} Fabrics Targetable by the {Verilog}-to-Routing {CAD} Flow", journal = j-TRETS, volume = "10", number = "2", pages = "11:1--11:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3024063", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article, we consider implementing field-programmable gate arrays (FPGAs) using a standard cell design methodology and present a framework for the automated generation of synthesizable FPGA fabrics. The open-source Verilog-to-Routing (VTR) FPGA architecture evaluation framework [Rose et al. 2012] is extended to generate synthesizable Verilog for its in-memory FPGA architectural device model. The Verilog can subsequently be synthesized into standard cells, placed and routed using an ASIC design flow. A second extension to VTR generates a configuration bitstream for the FPGA, where the bitstream configures the FPGA to realize a user-provided placed and routed design. The proposed framework and methodology makes possible the silicon implementation of a wide range of VTR-modeled FPGA fabrics. In an experimental study, area and timing-optimized FPGA implementations in 65nm TSMC standard cells are compared to a 65nm Altera commercial FPGA. In addition, we consider augmenting the generic standard-cell library from TSMC with a manually designed and laid-out FPGA-specific cell. We demonstrate the utility of the custom cell in reducing the area of the synthesized FPGA fabric.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Burovskiy:2017:EAH, author = "Pavel Burovskiy and Paul Grigoras and Spencer Sherwin and Wayne Luk", title = "Efficient Assembly for High-Order Unstructured {FEM} Meshes {(FPL 2015)}", journal = j-TRETS, volume = "10", number = "2", pages = "12:1--12:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3024064", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The Finite Element Method (FEM) is a common numerical technique used for solving Partial Differential Equations on large and unstructured domain geometries. Numerical methods for FEM typically use algorithms and data structures which exhibit an unstructured memory access pattern. This makes acceleration of FEM on Field-Programmable Gate Arrays using an efficient, deeply pipelined architecture particularly challenging. In this work, we focus on implementing and optimising a vector assembly operation which, in the context of FEM, induces the unstructured memory access. We propose a dataflow architecture, graph-based theoretical model, and design flow for optimising the assembly operation for spectral/hp finite element method on reconfigurable accelerators. We evaluate the proposed approach on two benchmark meshes and show that the graph-theoretic method of generating a static data access schedule results in a significant improvement in resource utilisation compared to prior work. This enables supporting larger FEM meshes on FPGA than previously possible.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yang:2017:FSA, author = "Hsin-Jung Yang and Kermin Fleming and Felix Winterstein and Michael Adler and Joel Emer", title = "{(FPL 2015) Scavenger}: Automating the Construction of Application-Optimized Memory Hierarchies", journal = j-TRETS, volume = "10", number = "2", pages = "13:1--13:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3009971", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "High-level abstractions separate algorithm design from platform implementation, allowing programmers to focus on algorithms while building complex systems. This separation also provides system programmers and compilers an opportunity to optimize platform services on an application-by-application basis. In field-programmable gate arrays (FPGAs), platform-level malleability extends to the memory system: Unlike general-purpose processors, in which memory hardware is fixed at design time, the capacity, associativity, and topology of FPGA memory systems may all be tuned to improve application performance. Since application kernels may only explicitly use few memory resources, substantial memory capacity may be available to the platform for use on behalf of the user program. In this work, we present Scavenger, which utilizes spare resources to construct program-optimized memories, and we also perform an initial exploration of methods for automating the construction of these application-specific memory hierarchies. Although exploiting spare resources can be beneficial, na{\"\i}vely consuming all memory resources may cause frequency degradation. 
To relieve timing pressure in large block RAM (BRAM) structures, we provide microarchitectural techniques to trade memory latency for design frequency. We demonstrate, by examining a set of benchmarks, that our scalable cache microarchitecture achieves performance gains of 7\% to 74\% (with a 26\% geometric mean on average) over the baseline cache microarchitecture when scaling the size of first-level caches to the maximum.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kapre:2017:HDR, author = "Nachiket Kapre and Jan Gray", title = "{Hoplite}: a Deflection-Routed Directional Torus {NoC} for {FPGAs}", journal = j-TRETS, volume = "10", number = "2", pages = "14:1--14:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3027486", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We can design an FPGA-optimized lightweight network-on-chip (NoC) router for flit-oriented packet-switched communication that is an order of magnitude smaller (in terms of LUTs and FFs) than state-of-the-art FPGA overlay routers available today. We present Hoplite, an efficient, lightweight, and fast FPGA overlay NoC that is designed to be small and compact by (1) using deflection routing instead of buffered switching to eliminate expensive FIFO buffers and (2) using a torus topology to reduce the cost of switch crossbar. Buffering and crossbar implementation complexities have traditionally limited speeds and imposed heavy resource costs in conventional FPGA overlay NoCs. 
We take care to exploit the fracturable lookup tables (LUT) organization of the FPGA to further improve the resource efficiency of mapping the expensive crossbar multiplexers. Hoplite can outperform classic, bidirectional, buffered mesh networks for single-flit-oriented FPGA applications by as much as $ 1.5 \times $ (best achievable throughputs for a $ 10 \times 10 $ system) or $ 2.5 \times $ (allocating same amount of FPGA resources to both NoCs) for uniform random traffic. When compared to buffered mesh switches, FPGA-based deflection routers are $ \approx 3.5 \times $ smaller (HLS-generated switch) and $ 2.5 \times $ faster (clock period) for 32b payloads. In a separate experiment, we hand-crafted an RTL version of our switch with location constraints that requires only 60 LUTs and 100 FFs per router and runs at 2.9ns. We conduct additional layout experiments on modern Xilinx and Altera FPGAs and demonstrate wide-channel chip-spanning layouts that run in excess of 300MHz while consuming 10--15\% of overall chip resources. We also demonstrate a clustered RISC-V multiprocessor organization that uses Hoplite to help deliver the high processing throughputs of the FPGA architecture to user applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Leong:2017:FYF, author = "Philip H. W. Leong and Hideharu Amano and Jason Anderson and Koen Bertels and Jo{\~a}o M. P. Cardoso and Oliver Diessel and Guy Gogniat and Mike Hutton and Junkyu Lee and Wayne Luk and Patrick Lysaght and Marco Platzner and Viktor K. 
Prasanna and Tero Rissa and Cristina Silvano and Hayden Kwok-Hay So and Yu Wang", title = "The First 25 Years of the {FPL} Conference: Significant Papers", journal = j-TRETS, volume = "10", number = "2", pages = "15:1--15:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2996468", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A summary of contributions made by significant papers from the first 25 years of the Field-Programmable Logic and Applications conference (FPL) is presented. The 27 papers chosen represent those which have most strongly influenced theory and practice in the field.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Takano:2017:PSA, author = "Shigeyuki Takano", title = "Performance Scalability of Adaptive Processor Architecture", journal = j-TRETS, volume = "10", number = "2", pages = "16:1--16:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3007902", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:01 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In this article, we evaluate the performance scalability of architectures called adaptive processors, which dynamically configure an application-specific pipelined datapath and perform a data-flow streaming execution. 
Previous works have examined the basics of the following: (1) a computational model that supports the swap-in/out of a partial datapath---namely, a virtual hardware is realized by hardware, without a host processor and its software; (2) an architecture that has shown a minimum pipeline requirement and a minimum component requirement; and (3) the characteristics of the execution phase and a stack shift that realizes the swap-in/out. However, these works did not explore the design space, particularly with respect to the following: (1) the clock cycle time on the adaptive processor, which must depend on a wire delay that is primarily used for the global communication of requests, acknowledgments, acquirements, releases, and so forth, and (2) a revised control system that can handle the out-of-order acknowledgment and in-order acquirement that guarantee the correct datapath configuration with a conditional branch for the configurations. This article explores the scaling of the ALU resources versus pipelining of the wires.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2017:TOF, author = "Zhiqiang Liu and Yong Dou and Jingfei Jiang and Jinwei Xu and Shijie Li and Yongmei Zhou and Yingnan Xu", title = "Throughput-Optimized {FPGA} Accelerator for Deep Convolutional Neural Networks", journal = j-TRETS, volume = "10", number = "3", pages = "17:1--17:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3079758", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Deep convolutional neural networks (CNNs) have gained great success in various computer vision applications.
State-of-the-art CNN models for large-scale applications are computation intensive and memory expensive and, hence, are mainly processed on high-performance processors like server CPUs and GPUs. However, there is an increasing demand of high-accuracy or real-time object detection tasks in large-scale clusters or embedded systems, which requires energy-efficient accelerators because of the green computation requirement or the limited battery restriction. Due to the advantages of energy efficiency and reconfigurability, Field-Programmable Gate Arrays (FPGAs) have been widely explored as CNN accelerators. In this article, we present an in-depth analysis of computation complexity and the memory footprint of each CNN layer type. Then a scalable parallel framework is proposed that exploits four levels of parallelism in hardware acceleration. We further put forward a systematic design space exploration methodology to search for the optimal solution that maximizes accelerator throughput under the FPGA constraints such as on-chip memory, computational resources, external memory bandwidth, and clock frequency. Finally, we demonstrate the methodology by optimizing three representative CNNs (LeNet, AlexNet, and VGG-S) on a Xilinx VC709 board. The average performance of the three accelerators is 424.7, 445.6, and 473.4GOP/s under 100MHz working frequency, which outperforms the CPU and previous work significantly.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ueno:2017:BCF, author = "Tomohiro Ueno and Kentaro Sano and Satoru Yamamoto", title = "Bandwidth Compression of Floating-Point Numerical Data Streams for {FPGA}-Based High-Performance Computing", journal = j-TRETS, volume = "10", number = "3", pages = "18:1--18:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3053688", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Although computational performance is often limited by insufficient bandwidth to/from an external memory, it is not easy to physically increase off-chip memory bandwidth. In this study, we propose a hardware-based bandwidth compression technique that can be applied to field-programmable gate array (FPGA)-based high-performance computation with a logically wider effective memory bandwidth. Our proposed hardware approach can boost the performance of FPGA-based stream computations by applying a data compression technique to effectively transfer more data streams. To apply this data compression technique to bandwidth compression via hardware, several requirements must first be satisfied, including an acceptable level of compression performance and a sufficiently small hardware footprint. Our proposed hardware-based bandwidth compressor utilizes an efficient prediction-based data compression algorithm. Moreover, we propose a multichannel serializer and deserializer that enable applications to use multiple channels of computational data with the bandwidth compression. The serializer encodes compressed data blocks of multiple channels into a data stream, which is efficiently written to an external memory.
Based on preliminary evaluation, we define an encoding format considering both high compression ratio and small hardware area. As a result, we demonstrate that our area saving bandwidth compressor increases performance of an FPGA-based fluid dynamics simulation by deploying more processing elements to exploit spatial parallelism with the enhanced memory bandwidth.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Laforest:2017:MCM, author = "Charles Eric Laforest and Jason H. Anderson", title = "Microarchitectural Comparison of the {MXP} and {Octavo} Soft-Processor {FPGA} Overlays", journal = j-TRETS, volume = "10", number = "3", pages = "19:1--19:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3053679", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Field-Programmable Gate Arrays (FPGAs) can yield higher performance and lower power than software solutions on CPUs or GPUs. However, designing with FPGAs requires specialized hardware design skills and hours-long CAD processing times. To reduce and accelerate the design effort, we can implement an overlay architecture on the FPGA, on which we then more easily construct the desired system but at a large cost in performance and area relative to a direct FPGA implementation. In this work, we compare the micro-architecture, performance, and area of two soft-processor overlays: the Octavo multi-threaded soft-processor and the MXP soft vector processor. 
To measure the area and performance penalties of these overlays relative to the underlying FPGA hardware, we compare direct FPGA implementations of the micro-benchmarks written in C synthesized with the LegUp HLS tool and also written in the Verilog HDL. Overall, Octavo's higher operating frequency and MXP's more efficient code execution results in similar performance from both, within an order of magnitude of direct FPGA implementations, but with a penalty of an order of magnitude greater area.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gu:2017:IRF, author = "Chongyan Gu and Neil Hanley and M{\'a}ire O'Neill", title = "Improved Reliability of {FPGA}-Based {PUF} Identification Generator Design", journal = j-TRETS, volume = "10", number = "3", pages = "20:1--20:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3053681", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Physical unclonable functions (PUFs), a form of physical security primitive, enable digital identifiers to be extracted from devices, such as field programmable gate arrays (FPGAs). Many PUF implementations have been proposed to generate these unique $n$-bit binary strings. However, they often offer insufficient uniqueness and reliability when implemented on FPGAs and can consume excessive resources. To address these problems, in this article we present an efficient, lightweight, and scalable PUF identification (ID) generator circuit that offers a compact design with good uniqueness and reliability properties and is specifically designed for FPGAs.
A novel post-characterisation methodology is also proposed that improves the reliability of a PUF without the need for any additional hardware resources. Moreover, the proposed post-characterisation method can be generally used for any FPGA-based PUF designs. The PUF ID generator consumes 8.95\% of the hardware resources of a low-cost Xilinx Spartan-6 LX9 FPGA and 0.81\% of a Xilinx Artix-7 FPGA. Experimental results show good uniqueness, reliability, and uniformity with no occurrence of bit-aliasing. In particular, the reliability of the PUF is close to 100\% over an environmental temperature range of 25${}^\circ $C to 70${}^\circ $C with $ \pm 10 \% $ variation in the supply voltage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Prost-Boucle:2017:EVF, author = "Adrien Prost-Boucle and Fr{\'e}d{\'e}ric P{\'e}trot and Vincent Leroy and Hande Alemdar", title = "Efficient and Versatile {FPGA} Acceleration of Support Counting for Stream Mining of Sequences and Frequent Itemsets", journal = j-TRETS, volume = "10", number = "3", pages = "21:1--21:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3027485", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Stream processing has become extremely popular for analyzing huge volumes of data for a variety of applications, including IoT, social networks, retail, and software logs analysis. Streams of data are produced continuously and are mined to extract patterns characterizing the data. A class of data mining algorithm, called generate-and-test, produces a set of candidate patterns that are then evaluated over data.
The main challenges of these algorithms are to achieve high throughput, low latency, and reduced power consumption. In this article, we present a novel power-efficient, fast, and versatile hardware architecture whose objective is to monitor a set of target patterns to maintain their frequency over a stream of data. This accelerator can be used to accelerate data-mining algorithms, including itemsets and sequences mining. The massive fine-grain reconfiguration capability of field-programmable gate array (FPGA) technologies is ideal to implement the high number of pattern-detection units needed for these intensive data-mining applications. We have thus designed and implemented an IP that features high-density FPGA occupation and high working frequency. We provide detailed description of the IP internal micro-architecture and its actual implementation and optimization for the targeted FPGA resources. We validate our architecture by developing a co-designed implementation of the Apriori Frequent Itemset Mining (FIM) algorithm, and perform numerous experiments against existing hardware and software solutions. We demonstrate that FIM hardware acceleration is particularly efficient for large and low-density datasets (i.e., long-tailed datasets). Our IP reaches a data throughput of 250 million items/s and monitors up to 11.6k patterns simultaneously, on a prototyping board that overall consumes 24W in the worst case. Furthermore, our hardware accelerator remains generic and can be integrated to other generate and test algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tili:2017:RPG, author = "Ilian Tili and Kalin Ovtcharov and J. 
Gregory Steffan", title = "Reducing the Performance Gap between Soft Scalar {CPUs} and Custom Hardware with {TILT}", journal = j-TRETS, volume = "10", number = "3", pages = "22:1--22:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3079757", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "By using resource sharing field-programmable gate array (FPGA) compute engines, we can reduce the performance gap between soft scalar CPUs and resource-intensive custom datapath designs. This article demonstrates that Thread- and Instruction-Level parallel Template architecture (TILT), a programmable FPGA-based horizontally microcoded compute engine designed to highly utilize floating point (FP) functional units (FUs), can improve significantly the average throughput of eight FP-intensive applications compared to a soft scalar CPU (similar to a FP-extended Nios). For eight benchmark applications, we show that: (i) a base TILT configuration having a single instance for each FU type can improve the performance over a soft scalar CPU by 15.8 $ \times $ , while requiring on average 26\% of the custom datapaths' area; (ii) selectively increasing the number of FUs can more than double TILT's average throughput, reducing the custom-datapath-throughput-gap from 576 $ \times $ to 14 $ \times $ ; and (iii) replicated instances of the most computationally dense TILT configuration that fit within the area of each custom datapath design can reduce the gap to 8.27 $ \times $ , while replicated instances of application-tuned configurations of TILT can reduce the custom-datapath-throughput-gap to an average of 5.22 $ \times $ , and up to 3.41 $ \times $ for the Matrix Multiply benchmark. 
Last, we present methods for design space reduction, and we correctly predict the computationally densest design for seven out of eight benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wulf:2017:OFP, author = "Nicholas Wulf and Alan D. George and Ann Gordon-Ross", title = "Optimizing {FPGA} Performance, Power, and Dependability with Linear Programming", journal = j-TRETS, volume = "10", number = "3", pages = "23:1--23:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3079756", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Field-programmable gate arrays (FPGA) are an increasingly attractive alternative to traditional microprocessor-based computing architectures in extreme-computing domains, such as aerospace and supercomputing. FPGAs offer several resource types that offer different tradeoffs between speed, power, and area, which make FPGAs highly flexible for varying application computational requirements. However, since an application's computational operations can map to different resource types, a major challenge in leveraging resource-diverse FPGAs is determining the optimal distribution of these operations across the device's available resources for varying FPGA devices, resulting in an extremely large design space. In order to facilitate fast design-space exploration, this article presents a method based on linear programming (LP) that determines the optimal operation distribution for a particular device and application with respect to performance, power, or dependability metrics. 
Our LP method is an effective tool for exploring early designs by quickly analyzing thousands of FPGAs to determine the best FPGA devices and operation distributions, which significantly reduces design time. We demonstrate our LP method's effectiveness with two case studies involving dot-product and distance-calculation kernels on a range of Virtex-5 FPGAs. Results show that our LP method selects optimal distributions of operations to within an average of 4\% of actual values.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Riebler:2017:EBB, author = "Heinrich Riebler and Michael Lass and Robert Mittendorf and Thomas L{\"o}cke and Christian Plessl", title = "Efficient Branch and Bound on {FPGAs} Using Work Stealing and Instance-Specific Designs", journal = j-TRETS, volume = "10", number = "3", pages = "24:1--24:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3053687", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Dec 23 10:23:02 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Branch and bound (B\&B) algorithms structure the search space as a tree and eliminate infeasible solutions early by pruning subtrees that cannot lead to a valid or optimal solution. Custom hardware designs significantly accelerate the execution of these algorithms. In this article, we demonstrate a high-performance B\&B implementation on FPGAs. First, we identify general elements of B\&B algorithms and describe their implementation as a finite state machine. Then, we introduce workers that autonomously cooperate using work stealing to allow parallel execution and full utilization of the target FPGA.
Finally, we explore advantages of instance-specific designs that target a specific problem instance to improve performance. We evaluate our concepts by applying them to a branch and bound problem, the reconstruction of corrupted AES keys obtained from cold-boot attacks. The evaluation shows that our work stealing approach is scalable with the available resources and provides speedups proportional to the number of workers. Instance-specific designs allow us to achieve an overall speedup of 47 $ \times $ compared to the fastest implementation of AES key reconstruction so far. Finally, we demonstrate how instance-specific designs can be generated just-in-time such that the provided speedups outweigh the additional time required for design synthesis.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gerlein:2017:NCA, author = "Eduardo A. Gerlein and T. M. McGinnity and Ammar Belatreche and Sonya Coleman", title = "Network on Chip Architecture for Multi-Agent Systems in {FPGA}", journal = j-TRETS, volume = "10", number = "4", pages = "25:1--25:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3121112", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 29 07:28:53 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A system of interacting agents is, by definition, very demanding in terms of computational resources. Although multi-agent systems have been used to solve complex problems in many areas, it is usually very difficult to perform large-scale simulations in their targeted serial computing platforms.
Reconfigurable hardware, in particular Field Programmable Gate Arrays devices, have been successfully used in High Performance Computing applications due to their inherent flexibility, data parallelism, and algorithm acceleration capabilities. Indeed, reconfigurable hardware seems to be the next logical step in the agency paradigm, but only a few attempts have been successful in implementing multi-agent systems in these platforms. This article discusses the problem of inter-agent communications in Field Programmable Gate Arrays. It proposes a Network-on-Chip in a hierarchical star topology to enable agents' transactions through message broadcasting using the Open Core Protocol as an interface between hardware modules. A customizable router microarchitecture is described and a multi-agent system is created to simulate and analyse message exchanges in a generic heavy traffic load agent-based application. Experiments have shown a throughput of 1.6Gbps per port at 100MHz without packet loss and seamless scalability characteristics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fraser:2017:FIK, author = "Nicholas J. Fraser and Junkyu Lee and Duncan J. M. Moss and Julian Faraone and Stephen Tridgell and Craig T. Jin and Philip H. W. 
Leong", title = "{FPGA} Implementations of Kernel Normalised Least Mean Squares Processors", journal = j-TRETS, volume = "10", number = "4", pages = "26:1--26:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106744", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 29 07:28:53 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Kernel adaptive filters (KAFs) are online machine learning algorithms which are amenable to highly efficient streaming implementations. They require only a single pass through the data and can act as universal approximators, i.e. approximate any continuous function with arbitrary accuracy. KAFs are members of a family of kernel methods which apply an implicit non-linear mapping of input data to a high dimensional feature space, permitting learning algorithms to be expressed entirely as inner products. Such an approach avoids explicit projection into the feature space, enabling computational efficiency. In this paper, we propose the first fully pipelined implementation of the kernel normalised least mean squares algorithm for regression. Independent training tasks necessary for hyperparameter optimisation fill pipeline stages, so no stall cycles to resolve dependencies are required. Together with other optimisations to reduce resource utilisation and latency, our core achieves 161 GFLOPS on a Virtex 7 XC7VX485T FPGA for a floating point implementation and 211 GOPS for fixed point. Our PCI Express based floating-point system implementation achieves 80\% of the core's speed, this being a speedup of 10$ \times $ over an optimised implementation on a desktop processor and 2.66$ \times $ over a GPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chu:2017:FCA, author = "Thiem Van Chu and Shimpei Sato and Kenji Kise", title = "Fast and Cycle-Accurate Emulation of Large-Scale Networks-on-Chip Using a Single {FPGA}", journal = j-TRETS, volume = "10", number = "4", pages = "27:1--27:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151758", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 29 07:28:53 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Modeling and simulation/emulation play a major role in research and development of novel Networks-on-Chip (NoCs). However, conventional software simulators are so slow that studying NoCs for emerging many-core systems with hundreds to thousands of cores is challenging. State-of-the-art FPGA-based NoC emulators have shown great potential in speeding up the NoC simulation, but they cannot emulate large-scale NoCs due to the FPGA capacity constraints. Moreover, emulating large-scale NoCs under synthetic workloads on FPGAs typically requires a large amount of memory and thus involves the use of off-chip memory, which makes the overall design much more complicated and may substantially degrade the emulation speed. This article presents methods for fast and cycle-accurate emulation of NoCs with up to thousands of nodes using a single FPGA. We first describe how to emulate a NoC under a synthetic workload using only FPGA on-chip memory (BRAMs). We next present a novel use of time-division multiplexing where BRAMs are effectively used for emulating a network using a small number of nodes, thereby overcoming the FPGA capacity constraints. We propose methods for emulating both direct and indirect networks, focusing on the commonly used meshes and fat-trees ($k$-ary $n$-trees).
This is different from prior work that considers only direct networks. Using the proposed methods, we build a NoC emulator, called FNoC, and demonstrate the emulation of some mesh-based and fat-tree-based NoCs with canonical router architectures. Our evaluation results show that (1) the size of the largest NoC that can be emulated depends on only the FPGA on-chip memory capacity; (2) a mesh-based NoC with 16,384 nodes (128$ \times $128 NoC) and a fat-tree-based NoC with 6,144 switch nodes and 4,096 terminal nodes (4-ary 6-tree NoC) can be emulated using a single Virtex-7 FPGA; and (3) when emulating these two NoCs, we achieve, respectively, 5,047$ \times $ and 232$ \times $ speedups over BookSim, one of the most widely used software-based NoC simulators, while maintaining the same level of accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "27", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yoshimi:2017:PPJ, author = "Masato Yoshimi and Yasin Oge and Tsutomu Yoshinaga", title = "Pipelined Parallel Join and Its {FPGA}-Based Acceleration", journal = j-TRETS, volume = "10", number = "4", pages = "28:1--28:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3079759", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 29 07:28:53 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "A huge amount of data is being generated and accumulated in data centers, which leads to an important increase in the required energy consumption to analyze these data. Thus, we must consider the redesign of current computer systems architectures to be more friendly to applications based on distributed algorithms that require a high data transfer rate. 
Novel computer architectures that introduce dedicated accelerators to enable near-data processing have been discussed and developed for high-speed big-data analysis. In this work, we propose a computer system with an FPGA-based accelerator, namely, interconnected-FPGAs, which offers two advantages: (1) direct data transmission and (2) offloading computation into data-flow in the FPGA. In this article, we demonstrate the capability of the proposed interconnected-FPGAs system to accelerate join operations in a relational database. We developed a new parallel join algorithm, PPJoin, targeted to big-data analysis in a shared-nothing architecture. PPJoin is an extended version of the NUMA-based parallel join algorithm, created by overlapping computation by multicore processors and data communication. The data communication between computational nodes can be accelerated by direct data transmission without passing through the main memory of the hosts. To confirm the performance of the PPJoin algorithm and its acceleration process using an interconnected-FPGA platform, we evaluated a simple query for large tables. Additionally, to support availability, we also evaluated the actual benchmark query. Our evaluation results confirm that the PPJoin algorithm is faster than a software-based query engine by 1.5--5 times. Moreover, we experimentally confirmed that the direct data transmission by interconnected FPGAs reduces computational time around 20\% for PPJoin.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "28", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fabry:2017:ERA, author = "Pieter Fabry and David Thomas", title = "Efficient Reconfigurable Architecture for Pricing Exotic Options", journal = j-TRETS, volume = "10", number = "4", pages = "29:1--29:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3158228", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 29 07:28:53 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "This article presents a new method for Monte Carlo (MC) option pricing using field-programmable gate arrays (FPGAs), which use a discrete-space random walk over a binomial lattice, rather than the continuous space-walks used by existing approaches. The underlying hypothesis is that the discrete-space walk will significantly reduce the area needed for each MC engine, and the resulting increase in parallelisation and raw performance outweighs any accuracy losses introduced by the discretisation. Experimental results support this hypothesis, showing that for a given MC simulation size, there is no significant loss in accuracy by using a discrete space model for the path-dependent exotic financial options. Analysis of the binomial simulation model shows that only limited-precision fixed-point arithmetic is needed, and also shows that pairs of MC kernels are able to share RAM resources. When using realistic constraints on pricing problems, it was found that the size of a discrete-space MC engine can be kept to 370 Flip-Flops and 233 Lookup Tables, allowing up to 3,000 variance-reduced MC cores in one FPGA. 
The combination of a highly parallelisable architecture and model-specific optimisations means that the binomial pricing technique allows for a 50$ \times $ improvement in throughput compared to existing FPGA approaches, without any reduction in accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "29", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bakos:2018:ISS, author = "Jason D. Bakos", title = "Introduction to the Special Section on {FCCM'16}", journal = j-TRETS, volume = "11", number = "1", pages = "1:1--1:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183572", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:42:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1e", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wong:2018:HPI, author = "Henry Wong and Vaughn Betz and Jonathan Rose", title = "High-Performance Instruction Scheduling Circuits for Superscalar Out-of-Order Soft Processors", journal = j-TRETS, volume = "11", number = "1", pages = "1:1--1:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3093741", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:42:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Soft processors have a role to play in simplifying field-programmable gate array (FPGA) application design as they can be deployed only when needed, and it is easier to write and debug single-threaded software code than create hardware. 
The breadth of this second role increases when the performance of the soft processor increases, yet the sophisticated out-of-order superscalar approaches that arrived in the mid-1990s are not employed, despite their area cost now being easily tolerable. In this article, we take an important step toward out-of-order execution in soft processors by exploring instruction scheduling in an FPGA substrate. This differs from the hard-processor design problem because the logic substrate is restricted to LUTs, whereas hard processor scheduling circuits employ CAM and wired-OR structures to great benefit. We discuss both circuit and microarchitectural trade-offs and compare three circuit structures for the scheduler, including a new structure called a fused-logic matrix scheduler. Using our optimized circuits, we show that four-issue distributed schedulers with up to 54 entries can be built with the same cycle time as the commercial Nios II/f soft processor (240MHz). This careful design has the potential to significantly increase both the IPC and raw compute performance of a soft processor, compared to current commercial soft processors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Davis:2018:KHA, author = "James J. Davis and Eddie Hung and Joshua M. Levine and Edward A. Stott and Peter Y. K. Cheung and George A. 
Constantinides", title = "{KAPow}: High-Accuracy, Low-Overhead Online Per-Module Power Estimation for {FPGA} Designs", journal = j-TRETS, volume = "11", number = "1", pages = "2:1--2:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3129789", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:42:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In an FPGA system-on-chip design, it is often insufficient to merely assess the power consumption of the entire circuit by compile-time estimation or runtime power measurement. Instead, to make better decisions, one must understand the power consumed by each module in the system. In this work, we combine measurements of register-level switching activity and system-level power to build an adaptive online model that produces live breakdowns of power consumption within the design. Online model refinement avoids time-consuming characterization while also allowing the model to track long-term operating condition changes. Central to our method is an automated flow that selects signals predicted to be indicative of high power consumption, instrumenting them for monitoring. We named this technique KAPow, for `K'ounting Activity for Power estimation, which we show to be accurate and to have low overheads across a range of representative benchmarks. We also propose a strategy allowing for the identification and subsequent elimination of counters found to be of low significance at runtime, reducing algorithmic complexity without sacrificing significant accuracy. Finally, we demonstrate an application example in which a module-level power breakdown can be used to determine an efficient mapping of tasks to modules and reduce system-wide power consumption by up to 7\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Giesen:2018:COS, author = "Hans Giesen and Benjamin Gojman and Raphael Rubin and Ji Kim and Andr{\'e} DeHon", title = "Continuous Online Self-Monitoring Introspection Circuitry for Timing Repair by Incremental Partial-Reconfiguration {(COSMIC TRIP)}", journal = j-TRETS, volume = "11", number = "1", pages = "3:1--3:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3158229", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:42:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We show that continuously monitoring on-chip delays at the LUT-to-LUT link level during operation allows a field-programmable gate array to detect and self-adapt to aging and environmental timing effects. Using a lightweight ($ < 4 \% $ added area) mechanism for monitoring transition timing, a Difference Detector with First-Fail Latch, we can estimate the timing margin on circuits and identify the individual links that have degraded and whose delay is determining the worst-case circuit delay. Combined with Choose-Your-own-Adventure precomputed, fine-grained repair alternatives, we introduce a strategy for rapid, in-system incremental repair of links with degraded timing. We show that these techniques allow us to respond to a single aging event in less than 190ms for the toronto20 benchmarks. The result is a step toward systems where adaptive reconfiguration on the time-scale of seconds is viable and beneficial.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhao:2018:FGM, author = "Zhuoran Zhao and Nguyen T. H.
Nguyen and Dimitris Agiakatsikas and Ganghee Lee and Ediz Cetin and
                 Oliver Diessel",
  title =        "Fine-Grained Module-Based Error Recovery in
                 {FPGA}-Based {TMR} Systems",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3173549",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Space processing applications deployed on SRAM-based
                 Field Programmable Gate Arrays (FPGAs) are vulnerable
                 to radiation-induced Single Event Upsets (SEUs).
                 Compared with the well-known SEU mitigation
                 solution---Triple Modular Redundancy (TMR) with
                 configuration memory scrubbing---TMR with module-based
                 error recovery (MER) is notably more energy efficient
                 and responsive in repairing soft-errors in the system.
                 Unfortunately, TMR-MER systems also need to resort to
                 scrubbing when errors occur between sub-components,
                 such as in interconnection nets, which are not
                 recovered by MER. This article addresses this problem
                 by proposing a fine-grained module-based error recovery
                 technique, which can localize and correct errors that
                 classic MER fails to do without additional system
                 hardware. We evaluate our proposal via fault-injection
                 campaigns on three types of circuits implemented in
                 Xilinx 7-Series devices. With respect to scrubbing, we
                 observed reductions in the mean time to repair
                 configuration memory errors of between 48.5\% and
                 89.4\%, while reductions in energy used recovering from
                 configuration memory errors were estimated at between
                 77.4\% and 96.1\%. These improvements result in higher
                 reliability for systems employing TMR with fine-grained
                 reconfiguration than equivalent systems relying on
                 scrubbing for configuration error recovery.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{AlKadi:2018:GPC,
  author =       "Muhammed {Al Kadi} and Benedikt Janssen and Jones Yudi
                 and Michael Huebner",
  title =        "General-Purpose Computing with Soft {GPUs} on
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3173548",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Using field-programmable gate arrays (FPGAs) as a
                 substrate to deploy soft graphics processing units
                 (GPUs) would enable offering the FPGA compute power in
                 a very flexible GPU-like tool flow.
                 Application-specific adaptations like selective
                 hardening of floating-point operations and instruction
                 set subsetting would mitigate the high area and power
                 demands of soft GPUs. This work explores the
                 capabilities and limitations of soft General Purpose
                 Computing on GPUs (GPGPU) for both fixed- and floating
                 point arithmetic. For this purpose, we have developed
                 FGPU: a configurable, scalable, and portable GPU
                 architecture designed especially for FPGAs. FGPU is
                 open-source and implemented entirely in RTL. It can be
                 programmed in OpenCL and controlled through a Python
                 API. This article introduces its hardware architecture
                 as well as its tool flow. We evaluated the proposed
                 GPGPU approach against multiple other solutions. In
                 comparison to homogeneous Multi-Processor
                 System-On-Chips (MPSoCs), we found that using a soft
                 GPU is a Pareto-optimal solution regarding throughput
                 per area and energy consumption.
On average, FGPU has a 2.9$ \times $ better compute density and
                 11.2$ \times $ less energy consumption than a single
                 MicroBlaze processor when computing in IEEE-754
                 floating-point format. An average speedup of about 4$
                 \times $ over the ARM Cortex-A9 supported with the NEON
                 vector co-processor has been measured for fixed- or
                 floating-point benchmarks. In addition, the biggest
                 FGPU cores we could implement on a Xilinx Zynq-7000
                 System-On-Chip (SoC) can deliver similar performance to
                 equivalent implementations with High-Level Synthesis
                 (HLS).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tatsumura:2018:EFM,
  author =       "Kosuke Tatsumura and Sadegh Yazdanshenas and Vaughn
                 Betz",
  title =        "Enhancing {FPGAs} with Magnetic Tunnel Junction-Based
                 Block {RAMs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154425",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "While plentiful on-chip memory is necessary for many
                 designs to fully utilize an FPGA's computational
                 capacity, SRAM scaling is becoming more difficult
                 because of increasing device variation. An alternative
                 is to build FPGA block RAM (BRAM) from magnetic tunnel
                 junctions (MTJ), as this emerging embedded memory has a
                 small cell size, low energy usage, and good
                 scalability. We conduct a detailed comparison study of
                 SRAM and MTJ BRAMs that includes cell designs that are
                 robust with device variation, transistor-level design
                 and optimization of all the required BRAM-specific
                 circuits, and variation-aware simulation at the 22nm
                 node.
At a 256Kb block size, MTJ-BRAM is 3.06$ \times $ denser and 55\% more
                 energy efficient and its F$_{max}$ is 274MHz, which is
                 adequate for most FPGA system clock domains. We also
                 detail further enhancements that allow these 256 Kb MTJ
                 BRAMs to operate at a higher speed of 353MHz for the
                 streaming FIFOs, which are very common in FPGA designs
                 and describe how the non-volatility of MTJ BRAM enables
                 novel on-chip configuration and power-down modes. For a
                 RAM architecture similar to the latest commercial
                 FPGAs, MTJ-BRAMs could expand FPGA memory capacity by
                 2.95$ \times $ with no die size increase.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Stewart:2018:RPI,
  author =       "Robert Stewart and Kirsty Duncan and Greg Michaelson
                 and Paulo Garcia and Deepayan Bhowmik and Andrew
                 Wallace",
  title =        "{RIPL}: a Parallel Image Processing Language for
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3180481",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Specialized FPGA implementations can deliver higher
                 performance and greater power efficiency than embedded
                 CPU or GPU implementations for real-time image
                 processing. Programming challenges limit their wider
                 use, because the implementation of FPGA architectures
                 at the register transfer level is time consuming and
                 error prone. Existing software languages supported by
                 high-level synthesis (HLS), although providing a
                 productivity improvement, are too general purpose to
                 generate efficient hardware without the use of
                 hardware-specific code optimizations.
                 Such optimizations leak hardware details into the
                 abstractions that software languages are there to
                 provide, and they require knowledge of FPGAs to
                 generate efficient hardware, such as by using language
                 pragmas to partition data structures across memory
                 blocks. This article presents a thorough account of the
                 Rathlin image processing language (RIPL), a high-level
                 image processing domain-specific language for FPGAs. We
                 motivate its design, based on higher-order algorithmic
                 skeletons, with requirements from the image processing
                 domain. RIPL's skeletons suffice to elegantly describe
                 image processing stencils, as well as recursive
                 algorithms with nonlocal random access patterns. At its
                 core, RIPL employs a dataflow intermediate
                 representation. We give a formal account of the
                 compilation scheme from RIPL skeletons to static and
                 cyclostatic dataflow models to describe their data
                 rates and static scheduling on FPGAs. RIPL compares
                 favorably to the Vivado HLS OpenCV library and C++
                 compiled with Vivado HLS. RIPL achieves between 54 and
                 191 frames per second (FPS) at 100MHz for four
                 synthetic benchmarks, faster than HLS OpenCV in three
                 cases. Two real-world algorithms are implemented in
                 RIPL: visual saliency and mean shift segmentation. For
                 the visual saliency algorithm, RIPL achieves 71 FPS
                 compared to optimized C++ at 28 FPS. RIPL is also
                 concise, being 5x shorter than C++ and 111x shorter
                 than an equivalent direct dataflow implementation. For
                 mean shift segmentation, RIPL achieves 7 FPS compared
                 to optimized C++ on 64 CPU cores at 1.1, and RIPL is
                 10x shorter than the direct dataflow FPGA
                 implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Khan:2018:EAM,
  author =       "Farheen Fatima Khan and Andy Ye",
  title =        "An Evaluation on the Accuracy of the Minimum-Width
                 Transistor Area Models in Ranking the Layout Area of
                 {FPGA} Architectures",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182394",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This work provides an evaluation on the accuracy of
                 the minimum-width transistor area models in ranking the
                 actual layout area of FPGA architectures. Both the
                 original VPR area model and the new COFFE area model
                 are compared against the actual layouts with up to
                 three metal layers for the various FPGA building
                 blocks. We found that both models have significant
                 variations with respect to the accuracy of their
                 predictions across the building blocks. In particular,
                 the original VPR model overestimates the layout area of
                 larger buffers, full adders, and multiplexers by as
                 much as 38\%, while they underestimate the layout area
                 of smaller buffers and multiplexers by as much as 58\%,
                 for an overall prediction error variation of 96\%. The
                 newer COFFE model also significantly overestimates the
                 layout area of full adders by 13\% and underestimates
                 the layout area of multiplexers by a maximum of 60\%
                 for a prediction error variation of 73\%. Such
                 variations are particularly significant considering
                 sensitivity analyses are not routinely performed in
                 FPGA architectural studies.
Our results suggest that such analyses are extremely important in
                 studies that employ the minimum-width area models so
                 the tolerance of the architectural conclusions against
                 the prediction error variations can be quantified.
                 Furthermore, an open-source version of the layouts of
                 the actual FPGA building blocks should be created so
                 their actual layout area can be used to achieve a
                 highly accurate ranking of the implementation area of
                 FPGA architectures built upon these layouts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wijesundera:2018:FRP,
  author =       "Deshya Wijesundera and Alok Prakash and Thambipillai
                 Srikanthan and Achintha Ihalage",
  title =        "Framework for Rapid Performance Estimation of Embedded
                 Soft Core Processors",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "9:1--9:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195801",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The large number of embedded soft core processors
                 available today make it tedious and time consuming to
                 select the best processor for a given application. This
                 task is even more challenging due to the numerous
                 configuration options available for a single soft core
                 processor while optimizing for contradicting design
                 requirements such as performance and area. In this
                 article, we propose a generic framework for rapid
                 performance estimation of applications on soft core
                 processors. The proposed technique is scalable to the
                 large number of configuration options available in
                 modern soft core processors by relying on rapid and
                 accurate estimation models instead of time-consuming
                 FPGA synthesis and execution-based techniques.
Experimental results on two leading commercial soft core processors
                 executing applications from the widely used CHStone
                 benchmark suite show an average error of less than 6\%
                 while running in the order of minutes when compared to
                 hours taken by synthesis-based techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rossi:2018:PPR,
  author =       "Enrico Rossi and Marvin Damschen and Lars Bauer and
                 Giorgio Buttazzo and J{\"o}rg Henkel",
  title =        "Preemption of the Partial Reconfiguration Process to
                 Enable Real-Time Computing With {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "10:1--10:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182183",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "To improve computing performance in real-time
                 applications, modern embedded platforms comprise
                 hardware accelerators that speed up the task's most
                 compute-intensive parts. A recent trend in the design
                 of real-time embedded systems is to integrate
                 field-programmable gate arrays (FPGA) that are
                 reconfigured with different accelerators at runtime, to
                 cope with dynamic workloads that are subject to timing
                 constraints.
                 One of the major limitations when dealing with partial
                 FPGA reconfiguration in real-time systems is that the
                 reconfiguration port can only perform one
                 reconfiguration at a time: if a high-priority task
                 issues a reconfiguration request while the
                 reconfiguration port is already occupied by a
                 lower-priority task, the high-priority task has to wait
                 until the current reconfiguration is completed (a
                 phenomenon known as priority inversion), unless the
                 current reconfiguration is aborted (introducing
                 unbounded delays in low-priority tasks, a phenomenon
                 known as starvation). This article shows how priority
                 inversion and starvation can be solved by making the
                 reconfiguration process preemptive---that is, allowing
                 it to be interrupted at any time and resumed at a later
                 time without restarting it from scratch. Such a feature
                 is crucial for the design of runtime reconfigurable
                 real-time systems but not yet available in today's
                 platforms. Furthermore, the trade-off of achieving a
                 guaranteed bound on the reconfiguration delay for
                 low-priority tasks and the maximum delay induced for
                 high-priority tasks when preempting an ongoing
                 reconfiguration has been identified and analyzed.
                 Experimental results on the Xilinx Zynq-7000 platform
                 show that the proposed implementation of preemptive
                 reconfiguration introduces a low runtime overhead, thus
                 effectively solving priority inversion and
                 starvation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Petelin:2018:WEF,
  author =       "Oleg Petelin and Vaughn Betz",
  title =        "{Wotan}: Evaluating {FPGA} Architecture Routability
                 without Benchmarks",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "11:1--11:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195800",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA routing architectures consist of routing wires
                 and programmable switches that together account for the
                 majority of the fabric delay and area, making
                 evaluation and optimization of an FPGA's routing
                 architecture very important. Routing architectures have
                 traditionally been evaluated using a full synthesize,
                 pack, place and route CAD flow over a suite of
                 benchmark circuits. While the results are accurate, a
                 full CAD flow has a long runtime and is often tuned to
                 a specific FPGA architecture type, which limits
                 exploration of different architecture options early in
                 the design process. In this article, we present Wotan,
                 a tool to quickly estimate routability for a wide range
                 of architectures without the use of benchmark circuits.
                 At its core, our routability predictor efficiently
                 counts paths through the FPGA routing graph to (1)
                 estimate the probability of node congestion and (2)
                 estimate the probabilities to successfully route a
                 randomized subset of (source, sink) pairs, which are
                 then combined into an overall routability metric.
We describe our predictor and present routability estimates for a range
                 of 6-LUT and 4-LUT architectures using mixes of wire
                 types connected in complex ways, showing a rank
                 correlation of 0.91 with routability results from the
                 full VPR CAD flow while requiring 18$ \times $ less CPU
                 effort.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Anandakumar:2018:RHA,
  author =       "N. Nalla Anandakumar and M. Prem Laxman Das and
                 Somitra K. Sanadhya and Mohammad S. Hashmi",
  title =        "Reconfigurable Hardware Architecture for Authenticated
                 Key Agreement Protocol Over Binary {Edwards} Curve",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3231743",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a high-performance
                 hardware architecture for Elliptic curve based
                 (authenticated) key agreement protocol ``Elliptic Curve
                 Menezes, Qu and Vanstone'' (ECMQV) over Binary Edwards
                 Curve (BEC). We begin by analyzing inversion module on
                 a 251-bit binary field. Subsequently, we present Field
                 Programmable Gate Array (FPGA) implementations of the
                 unified formula for computing elliptic curve point
                 addition on BEC in affine and projective coordinates
                 and investigate the relative performance of these two
                 coordinates. Then, we implement the $w$-coordinate
                 based differential addition formulae suitable for usage
                 in Montgomery ladder. Next, we present a novel hardware
                 architecture of BEC point multiplication using mixed
                 $w$-coordinates of the Montgomery laddering algorithm
                 and analyze it in terms of resistance to Simple Power
                 Analysis (SPA) attack.
                 In order to improve the performance, the architecture
                 utilizes registers efficiently and uses efficient
                 scheduling mechanisms for the BEC arithmetic
                 implementations. Our implementation results show that
                 the proposed architecture is resistant against SPA
                 attack and yields a better performance when compared to
                 the existing state-of-the-art BEC designs for computing
                 point multiplication (PM). Finally, we present an FPGA
                 design of ECMQV key agreement protocol using BEC
                 defined over GF($ 2^{251} $). The execution of ECMQV
                 protocol takes 66.47 $ \mu $s using 32,479 slices on
                 Virtex-4 FPGA and 52.34 $ \mu $s using 15,988 slices on
                 Virtex-5 FPGA. To the best of our knowledge, this is
                 the first FPGA design of the ECMQV protocol using
                 BEC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Daigneault:2018:ASS,
  author =       "Marc-Andre Daigneault and Jean Pierre David",
  title =        "Automated Synthesis of Streaming Transfer Level
                 Hardware Designs",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3243930",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "As modern field-programmable gate arrays (FPGA) enable
                 high computing performance and efficiency, their
                 programming with low-level hardware description
                 languages is time-consuming and remains a major
                 obstacle to their adoption. High-level synthesis
                 compilers are able to produce register-transfer-level
                 (RTL) designs from C/C++ algorithmic descriptions, but
                 despite allowing significant design-time improvements,
                 these tools are not always able to generate hardware
                 designs that compare to handmade RTL designs.
In this article, we consider synthesis from an intermediate-level (IL)
                 language that allows the description of algorithmic
                 state machines handling connections between streaming
                 sources and sinks. However, the interconnection of
                 streaming sources and sinks can lead to cyclic
                 combinational relations, resulting in undesirable
                 behaviors or un-synthesizable designs. We propose a
                 functional-level methodology to automate the resolution
                 of such cyclic relations into acyclic combinational
                 functions. The proposed IL synthesis methodology has
                 been applied to the design of pipelined floating-point
                 cores. The results obtained show how the proposed IL
                 methodology can simplify the description of pipelined
                 architectures while enabling performances that are
                 close to those achievable through an RTL design
                 methodology.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chen:2018:ISS,
  author =       "Deming Chen and Andrew Putnam and Steve Wilton",
  title =        "Introduction to the Special Section on Deep Learning
                 in {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3294768",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Prost-Boucle:2018:HEC,
  author =       "Adrien Prost-Boucle and Alban Bourge and
                 Fr{\'e}d{\'e}ric P{\'e}trot",
  title =        "High-Efficiency Convolutional Ternary Neural Networks
                 with Custom Adder Trees and Weight Compression",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3270764",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although performing inference with artificial neural
                 networks (ANN) was until quite recently considered as
                 essentially compute intensive, the emergence of deep
                 neural networks coupled with the evolution of the
                 integration technology transformed inference into a
                 memory bound problem. This ascertainment being
                 established, many works have lately focused on
                 minimizing memory accesses, either by enforcing and
                 exploiting sparsity on weights or by using few bits for
                 representing activations and weights, to be able to use
                 ANNs inference in embedded devices. In this work, we
                 detail an architecture dedicated to inference using
                 ternary $ \{ -1, 0, 1 \} $ weights and activations.
                 This architecture is configurable at design time to
                 provide throughput vs. power trade-offs to choose from.
                 It is also generic in the sense that it uses
                 information drawn for the target technologies (memory
                 geometries and cost, number of available cuts, etc.) to
                 adapt at best to the FPGA resources. This allows to
                 achieve up to 5.2k frames per second per Watt for
                 classification on a VC709 board using approximately
                 half of the resources of the FPGA.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Blott:2018:FRE,
  author =       "Michaela Blott and Thomas B. Preu{\ss}er and Nicholas
                 J. Fraser and Giulio Gambardella and Kenneth O'Brien
                 and Yaman Umuroglu and Miriam Leeser and Kees Vissers",
  title =        "{FINN-R}: an End-to-End Deep-Learning Framework for
                 Fast Exploration of Quantized Neural Networks",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242897",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Convolutional Neural Networks have rapidly become the
                 most successful machine-learning algorithm, enabling
                 ubiquitous machine vision and intelligent decisions on
                 even embedded computing systems. While the underlying
                 arithmetic is structurally simple, compute and memory
                 requirements are challenging. One of the promising
                 opportunities is leveraging reduced-precision
                 representations for inputs, activations, and model
                 parameters. The resulting scalability in performance,
                 power efficiency, and storage footprint provides
                 interesting design compromises in exchange for a small
                 reduction in accuracy. FPGAs are ideal for exploiting
                 low-precision inference engines leveraging custom
                 precisions to achieve the required numerical accuracy
                 for a given application. In this article, we describe
                 the second generation of the FINN framework, an
                 end-to-end tool that enables design-space exploration
                 and automates the creation of fully customized
                 inference engines on FPGAs. Given a neural network
                 description, the tool optimizes for given platforms,
                 design targets, and a specific precision.
We introduce formalizations of resource cost functions and performance
                 predictions and elaborate on the optimization
                 algorithms. Finally, we evaluate a selection of reduced
                 precision neural networks ranging from CIFAR-10
                 classifiers to YOLO-based object detection on a range
                 of platforms including PYNQ and AWS F1, demonstrating
                 new unprecedented measured throughput at 50 TOp/s on
                 AWS F1 and 5 TOp/s on embedded devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ding:2018:LLH,
  author =       "Ruizhou Ding and Zeye Liu and R. D. (Shawn) Blanton
                 and Diana Marculescu",
  title =        "Lightening the Load with Highly Accurate Storage- and
                 Energy-Efficient {LightNNs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3270689",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Hardware implementations of deep neural networks
                 (DNNs) have been adopted in many systems because of
                 their higher classification speed. However, while they
                 may be characterized by better accuracy, larger DNNs
                 require significant energy and area, thereby limiting
                 their wide adoption. The energy consumption of DNNs is
                 driven by both memory accesses and computation.
                 Binarized neural networks (BNNs), as a tradeoff between
                 accuracy and energy consumption, can achieve great
                 energy reduction and have good accuracy for large DNNs
                 due to their regularization effect. However, BNNs show
                 poor accuracy when a smaller DNN configuration is
                 adopted. In this article, we propose a new DNN
                 architecture, LightNN, which replaces the
                 multiplications to one shift or a constrained number of
                 shifts and adds.
Our theoretical analysis for LightNNs shows that their accuracy is
                 maintained while dramatically reducing storage and
                 energy requirements. For a fixed DNN configuration,
                 LightNNs have better accuracy at a slight energy
                 increase than BNNs, yet are more energy efficient with
                 only slightly less accuracy than conventional DNNs.
                 Therefore, LightNNs provide more options for hardware
                 designers to trade off accuracy and energy. Moreover,
                 for large DNN configurations, LightNNs have a
                 regularization effect, making them better in accuracy
                 than conventional DNNs. These conclusions are verified
                 by experiment using the MNIST and CIFAR-10 datasets for
                 different DNN configurations. Our FPGA implementation
                 for conventional DNNs and LightNNs confirms all
                 theoretical and simulation results and shows that
                 LightNNs reduce latency and use fewer FPGA resources
                 compared to conventional DNN architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Meloni:2018:NEC,
  author =       "Paolo Meloni and Alessandro Capotondi and Gianfranco
                 Deriu and Michele Brian and Francesco Conti and Davide
                 Rossi and Luigi Raffo and Luca Benini",
  title =        "{NEURAghe}: Exploiting {CPU--FPGA} Synergies for
                 Efficient and Flexible {CNN} Inference Acceleration on
                 {Zynq SoCs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3284357",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Deep convolutional neural networks (CNNs) obtain
                 outstanding results in tasks that require human-level
                 understanding of data, like image or speech
                 recognition.
                 However, their computational load is significant,
                 motivating the development of CNN-specialized
                 accelerators. This work presents NEURAghe, a flexible
                 and efficient hardware/software solution for the
                 acceleration of CNNs on Zynq SoCs. NEURAghe leverages
                 the synergistic usage of Zynq ARM cores and of a
                 powerful and flexible Convolution-Specific Processor
                 deployed on the reconfigurable logic. The
                 Convolution-Specific Processor embeds both a
                 convolution engine and a programmable soft core,
                 releasing the ARM processors from most of the
                 supervision duties and allowing the accelerator to be
                 controlled by software at an ultra-fine granularity.
                 This methodology opens the way for cooperative
                 heterogeneous computing: While the accelerator takes
                 care of the bulk of the CNN workload, the ARM cores can
                 seamlessly execute hard-to-accelerate parts of the
                 computational graph, taking advantage of the NEON
                 vector engines to further speed up computation. Through
                 the companion NeuDNN SW stack, NEURAghe supports
                 end-to-end CNN-based classification with a peak
                 performance of 169GOps/s, and an energy efficiency of
                 17GOps/W. Thanks to our heterogeneous computing model,
                 our platform improves upon the state-of-the-art,
                 achieving a frame rate of 5.5 frames per second (fps)
                 on the end-to-end execution of VGG-16 and 6.6fps on
                 ResNet-18.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2018:OCB, author = "Shuanglong Liu and Hongxiang Fan and Xinyu Niu and Ho-cheung Ng and Yang Chu and Wayne Luk", title = "Optimizing {CNN}-based Segmentation with Deeply Customized Convolutional and Deconvolutional Architectures on {FPGA}", journal = j-TRETS, volume = "11", number = "3", pages = "19:1--19:??", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3242900", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Convolutional Neural Networks (CNNs) based algorithms have been successful in solving image recognition problems, showing very large accuracy improvement. In recent years, deconvolution layers are widely used as key components in the state-of-the-art CNNs for end-to-end training and models to support tasks such as image segmentation and super resolution. However, the deconvolution algorithms are computationally intensive, which limits their applicability to real-time applications. Particularly, there has been little research on the efficient implementations of deconvolution algorithms on FPGA platforms that have been widely used to accelerate CNN algorithms by practitioners and researchers due to their high performance and power efficiency. In this work, we propose and develop deconvolution architecture for efficient FPGA implementation. FPGA-based accelerators are proposed for both deconvolution and CNN algorithms. Besides, memory sharing between the computation modules is proposed for the FPGA-based CNN accelerator as well as for other optimization techniques.
A non-linear optimization model based on the performance model is introduced to efficiently explore the design space to achieve optimal processing speed of the system and improve power efficiency. Furthermore, a hardware mapping framework is developed to automatically generate the low-latency hardware design for any given CNN model on the target device. Finally, we implement our designs on Xilinx Zynq ZC706 board and the deconvolution accelerator achieves a performance of 90.1 giga operations per second (GOPS) under 200MHz working frequency and a performance density of 0.10 GOPS/DSP using 32-bit quantization, which significantly outperforms previous designs on FPGAs. A real-time application of scene segmentation on Cityscapes Dataset is used to evaluate our CNN accelerator on Zynq ZC706 board, and the system achieves a performance of 107 GOPS and 0.12 GOPS/DSP using 16-bit quantization and supports up to 17 frames per second for 512 $ \times $ 512 image inputs with a power consumption of only 9.6W.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Boutros:2018:YCI, author = "Andrew Boutros and Sadegh Yazdanshenas and Vaughn Betz", title = "You Cannot Improve What You Do not Measure: {FPGA} vs. {ASIC} Efficiency Gaps for Convolutional Neural Network Inference", journal = j-TRETS, volume = "11", number = "3", pages = "20:1--20:??", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3242898", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Recently, deep learning (DL) has become best-in-class for numerous applications but at a high computational cost that necessitates high-performance energy-efficient acceleration. 
The reconfigurability of FPGAs is appealing due to the rapid change in DL models but also causes lower performance and area-efficiency compared to ASICs. In this article, we implement three state-of-the-art computing architectures (CAs) for convolutional neural network (CNN) inference on FPGAs and ASICs. By comparing the FPGA and ASIC implementations, we highlight the area and performance costs of programmability to pinpoint the inefficiencies in current FPGA architectures. We perform our experiments using three variations of these CAs for AlexNet, VGG-16 and ResNet-50 to allow extensive comparisons. We find that the performance gap varies significantly from 2.8$ \times $ to 6.3$ \times $, while the area gap is consistent across CAs with an 8.7 average FPGA-to-ASIC area ratio. Among different blocks of the CAs, the convolution engine, constituting up to 60\% of the total area, has a high area ratio ranging from 13 to 31. Motivated by our FPGA vs. ASIC comparisons, we suggest FPGA architectural changes such as increasing DSP block count, enhancing low-precision support in DSP blocks and rethinking the on-chip memories to reduce the programmability gap for DL applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rouhani:2018:RRT, author = "Bita Darvish Rouhani and Siam Umar Hussain and Kristin Lauter and Farinaz Koushanfar", title = "{ReDCrypt}: Real-Time Privacy-Preserving Deep Learning Inference in Clouds Using {FPGAs}", journal = j-TRETS, volume = "11", number = "3", pages = "21:1--21:??", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3242899", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Artificial Intelligence (AI) is increasingly incorporated into the cloud business in order to improve the functionality (e.g., accuracy) of the service. The adoption of AI as a cloud service raises serious privacy concerns in applications where the risk of data leakage is not acceptable. Examples of such applications include scenarios where clients hold potentially sensitive private information such as medical records, financial data, and/or location. This article proposes ReDCrypt, the first reconfigurable hardware-accelerated framework that empowers privacy-preserving inference of deep learning models in cloud servers. ReDCrypt is well-suited for streaming (a.k.a., real-time AI) settings where clients need to dynamically analyze their data as it is collected over time without having to queue the samples to meet a certain batch size. Unlike prior work, ReDCrypt neither requires to change how AI models are trained nor relies on two non-colluding servers to perform. The privacy-preserving computation in ReDCrypt is executed using Yao's Garbled Circuit (GC) protocol. 
We break down the deep learning inference task into two phases: (i) privacy-insensitive (local) computation, and (ii) privacy-sensitive (interactive) computation. We devise a high-throughput and power-efficient implementation of GC protocol on FPGA for the privacy-sensitive phase. ReDCrypt's accompanying API provides support for seamless integration of ReDCrypt into any deep learning framework. Proof-of-concept evaluations for different DL applications demonstrate up to 57-fold higher throughput per core compared to the best prior solution with no drop in the accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yu:2018:IDC, author = "Jincheng Yu and Guangjun Ge and Yiming Hu and Xuefei Ning and Jiantao Qiu and Kaiyuan Guo and Yu Wang and Huazhong Yang", title = "Instruction Driven Cross-layer {CNN} Accelerator for Fast Detection on {FPGA}", journal = j-TRETS, volume = "11", number = "3", pages = "22:1--22:??", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3283452", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In recent years, Convolutional Neural Networks (CNNs) have been widely applied in computer vision and have achieved significant improvements in object detection tasks. Although there are many optimizing methods to speed up CNN-based detection algorithms, it is still difficult to deploy detection algorithms on real-time low-power systems. Field-Programmable Gate Array (FPGA) has been widely explored as a platform for accelerating CNN due to its promising performance, high energy efficiency, and flexibility. 
Previous works show that the energy consumption of CNN accelerators is dominated by the memory access. By fusing multiple layers in CNN, the intermediate data transfer can be reduced. However, previous accelerators with the cross-layer scheduling are designed for a particular CNN model. In addition to the memory access optimization, the Winograd algorithm can greatly improve the computational performance of convolution. In this article, to improve the flexibility of hardware, we design an instruction-driven CNN accelerator, supporting the Winograd algorithm and the cross-layer scheduling, for object detection. We modify the loop unrolling order of CNN, so that we can schedule a CNN across different layers with instructions and eliminate the intermediate data transfer. We propose a hardware architecture to support the instructions with Winograd computation units and reach the state-of-the-art energy efficiency. To deploy image detection algorithms onto the proposed accelerator with fixed-point computation units, we adopt the fixed-point fine-tune method, which can guarantee the accuracy of the detection algorithms. We evaluate our accelerator and scheduling policy on the Xilinx KU115 FPGA platform. The intermediate data transfer can be reduced by more than 90\% on the VGG-D CNN model with the cross-layer strategy. Thus, the performance of our hardware accelerator reaches 1700GOP/s on the classification model VGG-D. We also implement a framework for object detection algorithms, which achieves 2.3$ \times $ and 50$ \times $ in energy efficiency compared with GPU and CPU, respectively. Compared with floating-point algorithms, the accuracy of the fixed-point detection algorithms only drops by less than 1\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2018:EMP, author = "Wensong Li and Fan Yang and Hengliang Zhu and Xuan Zeng and Dian Zhou", title = "An Efficient Memory Partitioning Approach for Multi-Pattern Data Access via Data Reuse", journal = j-TRETS, volume = "12", number = "1", pages = "1:1--1:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3301296", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301296", abstract = "Memory bandwidth has become a bottleneck that impedes performance improvement during the parallelism optimization of the datapath. Memory partitioning is a practical approach to reduce bank-level conflicts and increase the bandwidth on a field-programmable gate array. In this work, we propose a memory partitioning approach for multi-pattern data access. First, we propose to combine multiple patterns into a single pattern to reduce the complexity of multi-pattern. Then, we propose to perform data reuse analysis on the combined pattern to find data reuse opportunities and the non-reusable data pattern. Finally, an efficient bank mapping algorithm with low complexity and low overhead is proposed to find the optimal memory partitioning solution. Experimental results demonstrated that compared to the state-of-the-art method, our proposed approach can reduce the number of block RAMS by 58.9\% on average, with 79.6\% reduction in SLICEs, 85.3\% reduction in LUTs, 67.9\% reduction in Flip-Flops, 54.6\% reduction in DSP48Es, 83.9\% reduction in SRLs, 50.0\% reduction in storage overhead, 95.0\% reduction in execution time, and 77.3\% reduction in dynamic power consumption on average.
Meanwhile, the performance can be improved by 14.0\% on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Guo:2018:DSF, author = "Kaiyuan Guo and Shulin Zeng and Jincheng Yu and Yu Wang and Huazhong Yang", title = "{[DL]} A Survey of {FPGA}-based Neural Network Inference Accelerators", journal = j-TRETS, volume = "12", number = "1", pages = "2:1--2:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3289185", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3289185", abstract = "Recent research on neural networks has shown a significant advantage in machine learning over traditional algorithms based on handcrafted features and models. Neural networks are now widely adopted in regions like image, speech, and video recognition. But the high computation and storage complexity of neural network inference poses great difficulty on its application. It is difficult for CPU platforms to offer enough computation capacity. GPU platforms are the first choice for neural network processes because of its high computation capacity and easy-to-use development frameworks. However, FPGA-based neural network inference accelerator is becoming a research topic. With specifically designed hardware, FPGA is the next possible solution to surpass GPU in speed and energy efficiency. Various FPGA-based accelerator designs have been proposed with software and hardware optimization techniques to achieve high speed and energy efficiency. In this article, we give an overview of previous work on neural network inference accelerators based on FPGA and summarize the main techniques used. 
An investigation from software to hardware, from circuit level to system level is carried out to complete analysis of FPGA-based neural network inference accelerator design and serves as a guide to future work.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yazdanshenas:2018:CAM, author = "Sadegh Yazdanshenas and Vaughn Betz", title = "{COFFE 2}: Automatic Modelling and Optimization of Complex and Heterogeneous {FPGA} Architectures", journal = j-TRETS, volume = "12", number = "1", pages = "3:1--3:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3301298", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301298", abstract = "FPGAs are becoming more heterogeneous to better adapt to different markets, motivating rapid exploration of different blocks/tiles for FPGAs. To evaluate a new FPGA architectural idea, one should be able to accurately obtain the area, delay, and energy consumption of the block of interest. However, current FPGA circuit design tools can only model simple, homogeneous FPGA architectures with basic logic blocks and also lack DSP and other heterogeneous block support. Modern FPGAs are instead composed of many different tiles, some of which are designed in a full custom style and some of which mix standard cell and full custom styles. To fill this modelling gap, we introduce COFFE 2, an open-source FPGA design toolset for automatic FPGA circuit design. COFFE 2 uses a mix of full custom and standard cell flows and supports not only complex logic blocks with fracturable lookup tables and hard arithmetic but also arbitrary heterogeneous blocks. 
To validate COFFE 2 and demonstrate its features, we design and evaluate a multi-mode Stratix III-like DSP block and several logic tiles with fracturable LUTs and hard arithmetic. We also demonstrate how COFFE 2's interface to VTR allows full evaluation of block-routing interfaces and various fracturable 6-LUT architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Choi:2018:DAM, author = "Young-Kyu Choi and Jason Cong and Zhenman Fang and Yuchen Hao and Glenn Reinman and Peng Wei", title = "In-Depth Analysis on Microarchitectures of Modern Heterogeneous {CPU--FPGA} Platforms", journal = j-TRETS, volume = "12", number = "1", pages = "4:1--4:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3294054", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3294054", abstract = "Conventional homogeneous multicore processors are not able to provide the continued performance and energy improvement that we have expected from past endeavors. Heterogeneous architectures that feature specialized hardware accelerators are widely considered a promising paradigm for resolving this issue. Among different heterogeneous devices, FPGAs that can be reconfigured to accelerate a broad class of applications with orders-of-magnitude performance/watt gains, are attracting increased attention from both academia and industry. As a consequence, a variety of CPU--FPGA acceleration platforms with diversified microarchitectural features have been supplied by industry vendors. 
Such diversity, however, poses a serious challenge to application developers in selecting the appropriate platform for a specific application or application domain. This article aims to address this challenge by determining which microarchitectural characteristics affect performance, and in what ways. Specifically, we conduct a quantitative comparison and an in-depth analysis on five state-of-the-art CPU--FPGA acceleration platforms: (1) the Alpha Data board and (2) the Amazon F1 instance that represent the traditional PCIe-based platform with private device memory; (3) the IBM CAPI that represents the PCIe-based system with coherent shared memory; (4) the first generation of the Intel Xeon+FPGA Accelerator Platform that represents the QPI-based system with coherent shared memory; and (5) the second generation of the Intel Xeon+FPGA Accelerator Platform that represents a hybrid PCIe-based (non-coherent) and QPI-based (coherent) system with shared memory. Based on the analysis of their CPU--FPGA communication latency and bandwidth characteristics, we provide a series of insights for both application developers and platform designers. Furthermore, we conduct two case studies to demonstrate how these insights can be leveraged to optimize accelerator designs. The microbenchmarks used for evaluation have been released for public use.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cao:2018:FRA, author = "Shijie Cao and Lanshun Nie and Dechen Zhan and Wenqiang Wang and Ningyi Xu and Ramashis Das and Ming Wu and Lintao Zhang and Derek Chiou", title = "{FlexSaaS}: a Reconfigurable Accelerator for {Web} Search Selection", journal = j-TRETS, volume = "12", number = "1", pages = "5:1--5:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3301409", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301409", abstract = "Web search engines deploy large-scale selection services on CPUs to identify a set of web pages that match user queries. An FPGA-based accelerator can exploit various levels of parallelism and provide a lower latency, higher throughput, more energy-efficient solution than commodity CPUs. However, maintaining such a customized accelerator in a commercial search engine is challenging because selection services are changed often. This article presents our design for FlexSaaS (Flexible Selection as a Service), an FPGA-based accelerator for web search selection. To address efficiency and flexibility challenges, FlexSaaS abstracts computing models and separates memory access from computation. Specifically, FlexSaaS (i) contains a reconfigurable number of matching processors that can handle various possible query plans, (ii) decouples index stream reading from matching computation to fetch and decode index files, and (iii) includes a universal memory accessor that hides the complex memory hierarchy and reduces host data access latency. 
Evaluated on FPGAs in the selection service of a commercial web search---the Bing web search engine---FlexSaaS can be evolved quickly to adapt to new updates. Compared to the software baseline, FlexSaaS on Arria 10 reduces average latency by 30\% and increases throughput by 1.5$ \times $.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2019:PFF, author = "Gai Liu and Zhiru Zhang", title = "{PIMap}: a Flexible Framework for Improving {LUT}-Based Technology Mapping via Parallelized Iterative Optimization", journal = j-TRETS, volume = "11", number = "4", pages = "23:1--23:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3268344", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3268344", abstract = "Modern FPGA synthesis tools typically apply a predetermined sequence of logic optimizations on the input logic network before carrying out technology mapping. While the ``known recipes'' of logic transformations often lead to improved mapping results, there remains a nontrivial gap between the quality metrics driving the pre-mapping logic optimizations and those targeted by the actual technology mapping. Needless to mention, such miscorrelations would eventually result in suboptimal quality of results. In this article, we propose PIMap, which couples logic transformations and technology mapping under an iterative improvement framework for LUT-based FPGAs.
In each iteration, PIMap randomly proposes a transformation on the given logic network from an ensemble of candidate optimizations; it then invokes technology mapping and makes use of the mapping result to determine the likelihood of accepting the proposed transformation. By adjusting the optimization objective and incorporating required time constraints during the iterative process, PIMap can flexibly optimize for different objectives including area minimization, delay optimization, and delay-constrained area reduction. To mitigate the runtime overhead, we further introduce parallelization techniques to decompose a large design into multiple smaller sub-netlists that can be optimized simultaneously. Experimental results show that PIMap achieves promising quality improvement over a set of commonly used benchmarks, including improving the majority of the best-known area and delay records for the EPFL benchmark suite.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wang:2019:FBA, author = "Haomiao Wang and Prabu Thiagaraj and Oliver Sinnen", title = "{FPGA}-based Acceleration of {FT} Convolution for Pulsar Search Using {OpenCL}", journal = j-TRETS, volume = "11", number = "4", pages = "24:1--24:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3268933", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3268933", abstract = "The Square Kilometre Array (SKA) project will be the world's largest radio telescope array. With its large number of antennas, the number of signals that need to be processed is dramatic. 
One important element of the SKA's Central Signal Processor package is pulsar search. This article focuses on the FPGA-based acceleration of the Frequency-Domain Acceleration Search module, which is a part of SKA pulsar search engine. In this module, the frequency-domain input signals have to be processed by 85 Finite Impulse response (FIR) filters within a short period of limitation and for thousands of input arrays. Because of the large scale of the input length and FIR filter size, even high-end FPGA devices cannot parallelise the task completely. We start by investigating both time-domain FIR filter (TDFIR) and frequency-domain FIR filter (FDFIR) to tackle this task. We applied the overlap-add algorithm to split the coefficient array of TDFIR and the overlap-save algorithm to split the input signals of FDFIR. To achieve fast prototyping design, we employed OpenCL, which is a high-level FPGA development technique. The performance and power consumption are evaluated using multiple FPGA devices simultaneously and compared with GPU results, which is achieved by porting FPGA-based OpenCL kernels. The experimental evaluation shows that the FDFIR solution is very competitive in terms of performance, with a clear energy consumption advantage over the GPU solution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kroh:2019:EFG, author = "Alexander Kroh and Oliver Diessel", title = "Efficient Fine-grained Processor-logic Interactions on the Cache-coherent {Zynq} Platform", journal = j-TRETS, volume = "11", number = "4", pages = "25:1--25:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3277506", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3277506", abstract = "The introduction of cache-coherent processor-logic interconnects in CPU--FPGA platforms promises low-latency communication between CPU and FPGA fabrics. This reduced latency improves the performance of heterogeneous systems implemented on such devices and gives rise to new software architectures that can better use the available hardware. Via an extended study accelerating the software task scheduler of a microkernel operating system, this article reports on the potential for accelerating applications that exhibit fine-grained interactions. In doing so, we evaluate the performance of direct and cache-coherent communication methods for applications that involve frequent, low-bandwidth transactions between CPU and programmable logic. In the specific case we studied, we found that replacing a highly optimised software implementation of the task scheduler with an FPGA-based scheduler reduces the cost of communication between two software threads by 5.5\%. We also found that, while hardware acceleration reduces cache footprint, we still observe execution time variability because of other non-deterministic features of the CPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dumpala:2019:LUE, author = "Naveen Kumar Dumpala and Shivukumar B. Patil and Daniel Holcomb and Russell Tessier", title = "Loop Unrolling for Energy Efficiency in Low-Cost Field-Programmable Gate Arrays", journal = j-TRETS, volume = "11", number = "4", pages = "26:1--26:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3289186", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3289186", abstract = "Field-programmable gate arrays (FPGAs) are used for a wide variety of computations in low-cost embedded systems. Although these systems often have modest performance constraints, their energy consumption must typically be limited. Many FPGA applications employ repetitive loops that cannot be straightforwardly split into parallel computations. Performing a loop sequentially generally requires high-speed clocks that consume considerable clock power and sometimes require clock generation using a phase-locked loop (PLL). Loop unrolling addresses the high-speed clock issue, but its use often leads to significant combinational glitch power. In this work, a computer-aided design (CAD) approach that unrolls loops for designs targeted to low-cost FPGAs is described. Our approach considers latency constraints in an effort to minimize energy consumption for loop-based computation. To reduce glitch power, a glitch-filtering approach is introduced that provides a balance between glitch reduction and design performance. Glitch-filter enable signals are generated and routed to the filters using resources best suited to the target FPGA. 
Our approach automatically inserts glitch filters and associated control logic into a design prior to processing with FPGA synthesis, place, and route tools. Our energy-saving loop-unrolling approach has been evaluated using five benchmarks often used in low-cost FPGAs. The energy-saving capabilities of the approach have been evaluated for an Intel Cyclone IV and a Xilinx Artix-7 FPGA using board-level power measurement. The use of unrolling and glitch filtering is shown to reduce energy by at least 65\% for an Artix-7 device and 50\% for a Cyclone IV device while meeting design latency constraints.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2019:EMN, author = "Deming Chen", title = "Editorial: a Message from the New {Editor-in-Chief}", journal = j-TRETS, volume = "12", number = "2", pages = "6e:1--6e:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3326451", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3326451", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6e", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Morcel:2019:FAC, author = "Raghid Morcel and Hazem Hajj and Mazen A. R.
Saghir and Haitham Akkary and Hassan Artail and Rahul Khanna and Anil Keshavamurthy", title = "{FeatherNet}: an Accelerated Convolutional Neural Network Design for Resource-constrained {FPGAs}", journal = j-TRETS, volume = "12", number = "2", pages = "6:1--6:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3306202", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3306202", abstract = "Convolutional Neural Network (ConvNet or CNN) algorithms are characterized by a large number of model parameters and high computational complexity. These two requirements have made it challenging for implementations on resource-limited FPGAs. The challenges are magnified when considering designs for low-end FPGAs. While previous work has demonstrated successful ConvNet implementations with high-end FPGAs, this article presents a ConvNet accelerator design that enables the implementation of complex deep ConvNet architectures on resource-constrained FPGA platforms aimed at the IoT market. We call the design ``FeatherNet'' for its light resource utilization. The implementations are VHDL-based providing flexibility in design optimizations. As part of the design process, new methods are introduced to address several design challenges. The first method is a novel stride-aware graph-based method targeted at ConvNets that aims at achieving efficient signal processing with reduced resource utilization. The second method addresses the challenge of determining the minimal precision arithmetic needed while preserving high accuracy. For this challenge, we propose variable-width dynamic fixed-point representations combined with a layer-by-layer design-space pruning heuristic across the different layers of the deep ConvNet model. 
The third method aims at achieving a modular design that can support different types of ConvNet layers while ensuring low resource utilization. For this challenge, we propose the modules to be relatively small and composed of computational filters that can be interconnected to build an entire accelerator design. These model elements can be easily configured through HDL parameters (e.g., layer type, mask size, stride, etc.) to meet the needs of specific ConvNet implementations and thus they can be reused to implement a wide variety of ConvNet architectures. The fourth method addresses the challenge of design portability between two different FPGA vendor platforms, namely, Intel/Altera and Xilinx. For this challenge, we propose to instantiate the device-specific hardware blocks needed in each computational filter, rather than relying on the synthesis tools to infer these blocks, while keeping track of the similarities and differences between the two platforms. We believe that the solutions to these design challenges further advance knowledge as they can benefit designers and other researchers using similar devices or facing similar challenges. Our results demonstrated the success of addressing the design challenges and achieving low (30\%) resource utilization for the low-end FPGA platforms: Zedboard and Cyclone V. The design overcame the limitation of designs targeted for high-end platforms and that cannot fit on low-end IoT platforms. Furthermore, our design showed superior performance results (measured in terms of [Frame/s/W] per Dollar) compared to high-end optimized designs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhou:2019:FAN, author = "Xuegong Zhou and Lingli Wang and Alan Mishchenko", title = "Fast Adjustable {NPN} Classification Using Generalized Symmetries", journal = j-TRETS, volume = "12", number = "2", pages = "7:1--7:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3313917", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3313917", abstract = "NPN classification of Boolean functions is a powerful technique used in many logic synthesis and technology mapping tools in both standard cell and FPGA design flows. Computing the canonical form is the most common approach of Boolean function classification. This article proposes two different hybrid NPN canonical forms and a new algorithm to compute them. By exploiting symmetries under different phase assignment as well as higher-order symmetries, the search space of NPN canonical form computation is pruned and the runtime is dramatically reduced. Nevertheless, the runtime for some difficult functions remains high. Fast heuristic method can be used for such functions to compute semi-canonical forms in a reasonable time. The proposed algorithm can be adjusted to be a slow exact algorithm or a fast heuristic algorithm with lower quality. For exact NPN classification, the proposed algorithm is 40$ \times $ faster than state-of-the-art. For heuristic classification, the proposed algorithm has similar performance as state-of-the-art with a possibility to trade runtime for quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Oppermann:2019:EPM, author = "Julian Oppermann and Melanie Reuter-Oppermann and Lukas Sommer and Andreas Koch and Oliver Sinnen", title = "Exact and Practical Modulo Scheduling for High-Level Synthesis", journal = j-TRETS, volume = "12", number = "2", pages = "8:1--8:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3317670", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3317670", abstract = "Loop pipelining is an essential technique in high-level synthesis to increase the throughput and resource utilisation of field-programmable gate array--based accelerators. It relies on modulo schedulers to compute an operator schedule that allows subsequent loop iterations to overlap partially when executed while still honouring all precedence and resource constraints. Modulo schedulers face a bi-criteria problem: minimise the initiation interval (II; i.e., the number of timesteps after which new iterations are started) and minimise the schedule length. We present Moovac, a novel exact formulation that models all aspects (including the II minimisation) of the modulo scheduling problem as a single integer linear program, and discuss simple measures to prevent excessive runtimes, to challenge the old preconception that exact modulo scheduling is impractical. We substantiate this claim by conducting an experimental study covering 188 loops from two established high-level synthesis benchmark suites, four different time limits, and three bounds for the schedule length, to compare our approach against a highly tuned exact formulation and a state-of-the-art heuristic algorithm. 
In the fastest configuration, an accumulated runtime of under 16 minutes is spent on scheduling all loops, and proven optimal IIs are found for 179 test instances.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bo:2019:APR, author = "Chunkun Bo and Vinh Dang and Ted Xie and Jack Wadden and Mircea Stan and Kevin Skadron", title = "Automata Processing in Reconfigurable Architectures: In-the-Cloud Deployment, Cross-Platform Evaluation, and Fast Symbol-Only Reconfiguration", journal = j-TRETS, volume = "12", number = "2", pages = "9:1--9:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3314576", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3314576", abstract = "We present a general automata processing framework on FPGAs, which generates an RTL kernel for automata processing together with an AXI and PCIe based I/O circuitry. We implement the framework on both local nodes and cloud platforms (Amazon AWS and Nimbix) with novel features. A full performance comparison of the proposed framework is conducted against state-of-the-art automata processing engines on CPUs, GPUs, and Micron's Automata Processor using the ANMLZoo benchmark suite and some real-world datasets. Results show that FPGAs enable extremely high-throughput automata processing compared to von Neumann architectures. We also collect the resource utilization and power consumption on the two cloud platforms, and find that the I/O circuitry consumes most of the hardware resources and power. 
Furthermore, we propose a fast, symbol-only reconfiguration mechanism based on the framework for large pattern sets that cannot fit on a single device and need to be partitioned. The proposed method supports multiple passes of the input stream and reduces the re-compilation cost from hours to seconds.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dinh:2019:NFI, author = "Van Luan Dinh and Xuan Truong Nguyen and Hyuk-Jae Lee", title = "A Novel {FPGA} Implementation of a Time-to-Digital Converter Supporting Run-Time Estimation and Compensation", journal = j-TRETS, volume = "12", number = "2", pages = "10:1--10:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3322482", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3322482", abstract = "Time-to-digital converters (TDCs) are widely used in applications that require the measurement of the time interval between events. In previous designs using a feedback loop and an extended delay line, process-voltage-temperature (PVT) variation often decreases the accuracy of measurements. To overcome the loss of accuracy caused by PVT variation, this study proposes a novel design of a synthesizable TDC that employs run-time estimation and compensation of PVT variation. A delay line consisting of a series of buffers is used to detect the period of a ring oscillator designed to measure the time interval between two events. By comparing the detected period and the system clock, the variation of the oscillation period is compensated at run-time. 
The proposed TDC is successfully implemented by using a low-cost Xilinx Spartan-6 LX9 FPGA with a 50-MHz oscillator. Experimental results show that the proposed TDC is robust to PVT variation with a resolution of 19.1 ps. In comparison with previous design, the proposed TDC achieves about five times better tradeoff in the area, resolution, and frequency of the reference clock.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bobda:2019:ISS, author = "Christophe Bobda and Ken Eguro", title = "Introduction to the Special Section on Security in {FPGA}-accelerated Cloud and Datacenters", journal = j-TRETS, volume = "12", number = "3", pages = "11:1--11:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3352060", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3352060", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11e", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Giechaskiel:2019:LWE, author = "Ilias Giechaskiel and Ken Eguro and Kasper B.
Rasmussen", title = "Leakier Wires: Exploiting {FPGA} Long Wires for Covert- and Side-channel Attacks", journal = j-TRETS, volume = "12", number = "3", pages = "11:1--11:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3322483", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3322483", abstract = "In complex FPGA designs, implementations of algorithms and protocols from third-party sources are common. However, the monolithic nature of FPGAs means that all sub-circuits share common on-chip infrastructure, such as routing resources. This presents an attack vector for all FPGAs that contain designs from multiple vendors, especially for FPGAs used in multi-tenant cloud environments, or integrated into multi-core processors. In this article, we show that ``long'' routing wires present a new source of information leakage on FPGAs, by influencing the delay of adjacent long wires. We show that the effect is measurable for both static and dynamic signals and that it can be detected using small on-board circuits. We characterize the channel in detail and show that it is measurable even when multiple competing circuits (including multiple long-wire transmitters) are present and can be replicated on different generations and families of Xilinx devices (Virtex 5, Virtex 6, Artix 7, and Spartan 7). We exploit the leakage to create a covert channel with 6kbps of bandwidth and 99.9\% accuracy, and a side channel, which can recover signals kept constant for only 1.3 $ \mu $ s, with an accuracy of more than 98.4\%. Finally, we propose countermeasures to reduce the impact of this leakage.$^1$", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Krautter:2019:MEL, author = "Jonas Krautter and Dennis R. E. Gnad and Mehdi B. Tahoori", title = "Mitigating Electrical-level Attacks towards Secure Multi-Tenant {FPGAs} in the Cloud", journal = j-TRETS, volume = "12", number = "3", pages = "12:1--12:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328222", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3328222", abstract = "A rising trend is the use of multi-tenant FPGAs, particularly in cloud environments, where partial access to the hardware is given to multiple third parties. This leads to new types of attacks in FPGAs, which operate not only on the logic level, but also on the electrical level through the common power delivery network. Since FPGAs are configured from the software-side, attackers are enabled to launch hardware attacks from software, impacting the security of an entire system. In this article, we show the first attempt of a countermeasure against attacks on the electrical level, which is based on a bitstream checking methodology. Bitstreams are translated back into flat technology mapped netlists, which are then checked for properties that indicate potential malicious runtime behavior of FPGA logic. Our approach can provide a metric of potential risk of the FPGA bitstream being used in active fault or passive side-channel attacks against other users of the FPGA fabric or the entire SoC platform.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Elrabaa:2019:PPP, author = "Muhammad E. S. Elrabaa and Mohamed A. Al-Asli and Marwan H. Abu-Amara", title = "A Protection and Pay-per-use Licensing Scheme for On-cloud {FPGA} Circuit {IPs}", journal = j-TRETS, volume = "12", number = "3", pages = "13:1--13:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3329861", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3329861", abstract = "Using security primitives, a novel scheme for licensing hardware intellectual properties (HWIPs) on Field Programmable Gate Arrays (FPGAs) in public clouds is proposed. The proposed scheme enforces a pay-per-use model, allows HWIP's installation only on specific on-cloud FPGAs, and efficiently protects the HWIPs from being cloned, reverse engineered, or used without the owner's authorization by any party, including a cloud insider. It also provides protection for the users' designs integrated with the HWIP on the same FPGA. This enables cloud tenants to license HWIPs in the cloud from the HWIP vendors at a relatively low price based on usage instead of paying the expensive unlimited HWIP license fee. The scheme includes a protocol for FPGA authentication, HWIP secure decryption, and usage by the clients without the need for the HWIP vendor to be involved or divulge their secret keys. A complete prototype test-bed implementation showed that the proposed scheme is very feasible with relatively low resource utilization. Experiments also showed that a HWIP could be licensed and set up in the on-cloud FPGA in 0.9s. 
This is 15 times faster than setting up the same HWIP from outside the cloud, which takes about 14s based on the average global Internet speed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2019:RAD, author = "Jiliang Zhang and Gang Qu", title = "Recent Attacks and Defenses on {FPGA}-based Systems", journal = j-TRETS, volume = "12", number = "3", pages = "14:1--14:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3340557", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3340557", abstract = "Field-programmable gate array (FPGA) is a kind of programmable chip that is widely used in many areas, including automotive electronics, medical devices, military and consumer electronics, and is gaining more popularity. Unlike the application specific integrated circuits (ASIC) design, an FPGA-based system has its own supply-chain model and design flow, which brings interesting security and trust challenges. In this survey, we review the security and trust issues related to FPGA-based systems from the market perspective, where we model the market with the following parties: FPGA vendors, foundries, IP vendors, EDA tool vendors, FPGA-based system developers, and end-users. For each party, we show the security and trust problems they need to be aware of and the associated solutions that are available. We also discuss some challenges and opportunities in the security and trust of FPGA-based systems used in large-scale cloud and datacenters.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Umuroglu:2019:OBS, author = "Yaman Umuroglu and Davide Conficconi and Lahiru Rasnayake and Thomas B. Preusser and Magnus Sj{\"a}lander", title = "Optimizing Bit-Serial Matrix Multiplication for Reconfigurable Computing", journal = j-TRETS, volume = "12", number = "3", pages = "15:1--15:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3337929", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3337929", abstract = "Matrix--matrix multiplication is a key computational kernel for numerous applications in science and engineering, with ample parallelism and data locality that lends itself well to high-performance implementations. Many matrix multiplication-dependent applications can use reduced-precision integer or fixed-point representations to increase their performance and energy efficiency while still offering adequate quality of results. However, precision requirements may vary between different application phases or depend on input data, rendering constant-precision solutions ineffective. BISMO, a vectorized bit-serial matrix multiplication overlay for reconfigurable computing, previously utilized the excellent binary-operation performance of FPGAs to offer a matrix multiplication performance that scales with required precision and parallelism. We show how BISMO can be scaled up on Xilinx FPGAs using an arithmetic architecture that better utilizes six-input LUTs. The improved BISMO achieves a peak performance of 15.4 binary TOPS on the Ultra96 board with a Xilinx UltraScale+ MPSoC.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Al-Hyari:2019:NCE, author = "Abeer Al-Hyari and Ziad Abuowaimer and Timothy Martin and Gary Gr{\'e}wal and Shawki Areibi and Anthony Vannelli", title = "Novel Congestion-estimation and Routability-prediction Methods based on Machine Learning for Modern {FPGAs}", journal = j-TRETS, volume = "12", number = "3", pages = "16:1--16:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3337930", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3337930", abstract = "Effectively estimating and managing congestion during placement can save substantial placement and routing runtime. In this article, we present a machine-learning model for accurately and efficiently estimating congestion during FPGA placement. Compared with the state-of-the-art machine-learning congestion-estimation model, our results show a 25\% improvement in prediction accuracy. This makes our model competitive with congestion estimates produced using a global router. However, our model runs, on average, 291$ \times $ faster than the global router. Overall, we are able to reduce placement runtimes by 17\% and router runtimes by 19\%. An additional machine-learning model is also presented that uses the output of the first congestion-estimation model to determine whether or not a placement is routable. This second model has an accuracy in the range of 93\% to 98\%, depending on the classification algorithm used to implement the learning model, and runtimes of a few milliseconds, thus making it suitable for inclusion in any placer with no worry of additional computational overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Owaida:2019:DID, author = "Muhsen Owaida and Amit Kulkarni and Gustavo Alonso", title = "Distributed Inference over Decision Tree Ensembles on Clusters of {FPGAs}", journal = j-TRETS, volume = "12", number = "4", pages = "17:1--17:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3340263", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3340263", abstract = "Given the growth in data inputs and application complexity, it is often the case that a single hardware accelerator is not enough to solve a given problem. In particular, the computational demands and I/O of many tasks in machine learning often require a cluster of accelerators to make a relevant difference in performance. In this article, we explore the efficient construction of FPGA clusters using inference over Decision Tree Ensembles as the target application. The article explores several levels of the problem: (1) a lightweight inter-FPGA communication protocol and routing layer to facilitate the communication between the different FPGAs, (2) the data partitioning and distribution strategies maximizing performance, (3) and an in depth analysis on how applications can be efficiently distributed over such a cluster. The experimental analysis shows that the resulting system can support inference over decision tree ensembles at a significantly higher throughput than that achieved by existing systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ahmed:2019:FAB, author = "Ibrahim Ahmed and Shuze Zhao and James Meijers and Olivier Trescases and Vaughn Betz", title = "{FRoC 2.0}: Automatic {BRAM} and Logic Testing to Enable Dynamic Voltage Scaling for {FPGA} Applications", journal = j-TRETS, volume = "12", number = "4", pages = "20:1--20:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3354188", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3354188", abstract = "In earlier technology nodes, FPGAs had low power consumption compared to other compute chips such as CPUs and GPUs. However, in the 14nm technology node, FPGAs are consuming unprecedented power in the 100+W range, making power consumption a pressing concern. To reduce FPGA power consumption, several researchers have proposed deploying dynamic voltage scaling. While the previously proposed solutions show promising results, they have difficulty guaranteeing safe operation at reduced voltages for applications that use the FPGA hard blocks. In this work, we present the first DVS solution that is able to fully handle FPGA applications that use BRAMs. Our solution not only robustly tests the soft logic component of the application but also tests all components connected to the BRAMs. We extend a previously proposed CAD tool, FRoC, to automatically generate calibration bitstreams that are used to measure the application's critical path delays on silicon. The calibration bitstreams also include testers that ensure all used SRAM cells operate safely while scaling V$_{dd}$. 
We experimentally show that using our DVS solution we can save 32\% of the total power consumed by a discrete Fourier transform application running with the fixed nominal supply voltage and clocked at the F$_{max}$ reported by static timing analysis.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tridgell:2019:UTN, author = "Stephen Tridgell and Martin Kumm and Martin Hardieck and David Boland and Duncan Moss and Peter Zipf and Philip H. W. Leong", title = "Unrolling Ternary Neural Networks", journal = j-TRETS, volume = "12", number = "4", pages = "22:1--22:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3359983", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Oct 19 17:43:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The computational complexity of neural networks for large-scale or real-time applications necessitates hardware acceleration. Most approaches assume that the network architecture and parameters are unknown at design time, permitting usage in a large number of applications. This article demonstrates, for the case where the neural network architecture and ternary weight values are known a priori, that extremely high throughput implementations of neural network inference can be made by customising the datapath and routing to remove unnecessary computations and data movement. This approach is ideally suited to FPGA implementations as a specialized implementation of a trained network improves efficiency while still retaining generality with the reconfigurability of an FPGA. A VGG-style network with ternary weights and fixed point activations is implemented for the CIFAR10 dataset on Amazon's AWS F1 instance. 
This article demonstrates how to remove 90\% of the operations in convolutional layers by exploiting sparsity and compile-time optimizations. The implementation in hardware achieves 90.9 $ \pm $ 0.1\% accuracy and 122k frames per second, with a latency of only 29 $ \mu $ s, which is the fastest CNN inference implementation reported so far on an FPGA.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Serre:2020:DBH, author = "Fran{\c{c}}ois Serre and Markus P{\"u}schel", title = "{DSL}-Based Hardware Generation with {Scala}: Example {Fast Fourier Transforms} and Sorting Networks", journal = j-TRETS, volume = "13", number = "1", pages = "1:1--1:23", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3359754", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Feb 6 08:37:52 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3359754", abstract = "We present a hardware generator for computations with regular structure including the fast Fourier transform (FFT), sorting networks, and others. The input of the generator is a high-level description of the algorithm; the output is a token-based, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Alachiotis:2020:RXF, author = "Nikolaos Alachiotis and Charalampos Vatsolakis and Grigorios Chrysos and Dionisios Pnevmatikatos", title = "{RAiSD-X}: a Fast and Accurate {FPGA} System for the Detection of Positive Selection in Thousands of Genomes", journal = j-TRETS, volume = "13", number = "1", pages = "2:1--2:30", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3364225", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Feb 6 08:37:52 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3364225", abstract = "Detecting traces of positive selection in genomes carries theoretical significance and has practical applications from shedding light on the forces that drive adaptive evolution to the design of more effective drug treatments. The size of genomic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Attia:2020:FFI, author = "Sameh Attia and Vaughn Betz", title = "Feel Free to Interrupt: Safe Task Stopping to Enable {FPGA} Checkpointing and Context Switching", journal = j-TRETS, volume = "13", number = "1", pages = "3:1--3:27", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372491", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Feb 6 08:37:52 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372491", abstract = "Saving and restoring an FPGA task state in an orderly manner is essential to enable hardware checkpointing, which is highly desirable to improve the ability to debug cloud-scale hardware services, and context switching, which allows multiple users to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jamal:2020:FTH, author = "Al-Shahna Jamal and Eli Cahill and Jeffrey Goeders and Steven J. E. Wilton", title = "Fast Turnaround {HLS} Debugging Using Dependency Analysis and Debug Overlays", journal = j-TRETS, volume = "13", number = "1", pages = "4:1--4:26", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372490", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Feb 6 08:37:52 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372490", abstract = "High-level synthesis (HLS) has gained considerable traction over recent years, as it allows for faster development and verification of hardware accelerators than traditional RTL design. 
While HLS allows for most bugs to be caught during software \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kourfali:2020:CDD, author = "Alexandra Kourfali and Dirk Stroobandt", title = "In-Circuit Debugging with Dynamic Reconfiguration of {FPGA} Interconnects", journal = j-TRETS, volume = "13", number = "1", pages = "5:1--5:29", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3375459", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Feb 6 08:37:52 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3375459", abstract = "In this work, a novel method for in-circuit debugging on FPGAs is introduced that allows the insertion of low-overhead debugging infrastructure by exploiting the technique of parameterized configurations. This allows the parameterization of the LUTs and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Garg:2020:HNC, author = "Tushar Garg and Saud Wasly and Rodolfo Pellizzoni and Nachiket Kapre", title = "{HopliteBuf}: Network Calculus-Based Design of {FPGA NoCs} with Provably Stall-Free {FIFOs}", journal = j-TRETS, volume = "13", number = "2", pages = "6:1--6:35", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3375899", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jun 11 15:19:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3375899", abstract = "HopliteBuf is a deflection-free, low-cost, and high-speed FPGA overlay Network-on-chip (NoC) with stall-free buffers. It is an FPGA-friendly 2D unidirectional torus topology built on top of HopliteRT overlay NoC. The stall-free buffers in HopliteBuf are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fraser:2020:KNL, author = "Nicholas J. Fraser and Philip H. W. Leong", title = "Kernel Normalised Least Mean Squares with Delayed Model Adaptation", journal = j-TRETS, volume = "13", number = "2", pages = "7:1--7:30", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3376924", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jun 11 15:19:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3376924", abstract = "Kernel adaptive filters (KAFs) are non-linear filters which can adapt temporally and have the additional benefit of being computationally efficient through use of the ``kernel trick''. 
In a number of real-world applications, such as channel equalisation, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Besta:2020:SCM, author = "Maciej Besta and Marc Fischer and Tal Ben-Nun and Dimitri Stanojevic and Johannes {De Fine Licht} and Torsten Hoefler", title = "Substream-Centric Maximum Matchings on {FPGA}", journal = j-TRETS, volume = "13", number = "2", pages = "8:1--8:33", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377871", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jun 11 15:19:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377871", abstract = "Developing high-performance and energy-efficient algorithms for maximum matchings is becoming increasingly important in social network analysis, computational sciences, scheduling, and others. In this work, we propose the first maximum matching \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Murray:2020:VHP, author = "Kevin E. Murray and Oleg Petelin and Sheng Zhong and Jia Min Wang and Mohamed Eldafrawy and Jean-Philippe Legault and Eugene Sha and Aaron G. Graham and Jean Wu and Matthew J. P. Walker and Hanqing Zeng and Panagiotis Patros and Jason Luu and Kenneth B. 
Kent and Vaughn Betz", title = "{VTR 8}: High-performance {CAD} and Customizable {FPGA} Architecture Modelling", journal = j-TRETS, volume = "13", number = "2", pages = "9:1--9:55", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388617", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jun 11 15:19:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3388617", abstract = "Developing Field-programmable Gate Array (FPGA) architectures is challenging due to the competing requirements of various application domains and changing manufacturing process technology. This is compounded by the difficulty of fairly evaluating FPGA \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Delomier:2020:MBD, author = "Yann Delomier and Bertrand {Le Gal} and J{\'e}r{\'e}mie Crenne and Christophe Jego", title = "Model-based Design of Hardware {SC} Polar Decoders for {FPGAs}", journal = j-TRETS, volume = "13", number = "2", pages = "10:1--10:27", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3391431", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Thu Jun 11 15:19:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3391431", abstract = "Polar codes are a new error correction code family that should be benchmarked and evaluated in comparison to LDPC and turbo-codes. Indeed, recent advances in the 5G digital communication standard recommended the use of polar codes in EMBB control \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shao:2020:PGF, author = "Zhiyuan Shao and Chenhao Liu and Ruoshi Li and Xiaofei Liao and Hai Jin", title = "Processing Grid-format Real-world Graphs on {DRAM}-based {FPGA} Accelerators with Application-specific Caching Mechanisms", journal = j-TRETS, volume = "13", number = "3", pages = "11:1--11:33", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3391920", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3391920", abstract = "Graph processing is one of the important research topics in the big-data era. To build a general framework for graph processing by using a DRAM-based FPGA board with deep memory hierarchy, one of the reasonable methods is to partition a given big graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Eldafrawy:2020:FLB, author = "Mohamed Eldafrawy and Andrew Boutros and Sadegh Yazdanshenas and Vaughn Betz", title = "{FPGA} Logic Block Architectures for Efficient Deep Learning Inference", journal = j-TRETS, volume = "13", number = "3", pages = "12:1--12:34", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3393668", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3393668", abstract = "Reducing the precision of deep neural network (DNN) inference accelerators can yield large efficiency gains with little or no accuracy degradation compared to half or single precision floating-point by enabling more multiplication operations per unit \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mu:2020:OOB, author = "Jiandong Mu and Wei Zhang and Hao Liang and Sharad Sinha", title = "Optimizing {OpenCL}-Based {CNN} Design on {FPGA} with Comprehensive Design Space Exploration and Collaborative Performance Modeling", journal = j-TRETS, volume = "13", number = "3", pages = "13:1--13:28", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397514", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3397514", abstract = "Recent success in applying convolutional neural networks (CNNs) to object detection and classification has sparked great interest in accelerating CNNs using hardware-like field-programmable gate arrays (FPGAs). However, finding an efficient FPGA design \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sabogal:2020:RFE, author = "Sebastian Sabogal and Alan George and Christopher Wilson", title = "Reconfigurable Framework for Environmentally Adaptive Resilience in Hybrid Space Systems", journal = j-TRETS, volume = "13", number = "3", pages = "14:1--14:32", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3398380", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3398380", abstract = "Due to ongoing innovations in both sensor technology and spacecraft autonomy, onboard space processing continues to be outpaced by the escalating computational demands required for next-generation missions. Commercial-off-the-shelf, hybrid system-on- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{La:2020:FMS, author = "Tuan Minh La and Kaspar Matas and Nikola Grunchevski and Khoa Dang Pham and Dirk Koch", title = "{FPGADefender}: Malicious Self-oscillator Scanning for {Xilinx UltraScale+} {FPGAs}", journal = j-TRETS, volume = "13", number = "3", pages = "15:1--15:31", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3402937", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3402937", abstract = "Sharing configuration bitstreams rather than netlists is a very desirable feature to protect IP or to share IP without longer CAD tool processing times. 
Furthermore, an increasing number of systems could hugely benefit from serving multiple users on the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tang:2020:PSM, author = "Qi Tang and Zhe Wang and Biao Guo and Li-Hua Zhu and Ji-Bo Wei", title = "Partitioning and Scheduling with Module Merging on Dynamic Partial Reconfigurable {FPGAs}", journal = j-TRETS, volume = "13", number = "3", pages = "16:1--16:24", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3403702", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Sep 5 18:51:36 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3403702", abstract = "Field programmable gate array (FPGA) is ubiquitous nowadays and is applied to many areas. Dynamic partial reconfiguration (DPR) is introduced to most modern FPGAs, enabling changing the function of a part of the FPGA by dynamically loading new \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dehon:2020:ISS, author = "Andr{\'e} DeHon", title = "Introduction to Special Section on {FCCM 2019}", journal = j-TRETS, volume = "13", number = "4", pages = "17:1--17:2", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3410373", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 2 07:58:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3410373", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhou:2020:AFR, author = "Yun Zhou and Dries Vercruyce and Dirk Stroobandt", title = "Accelerating {FPGA} Routing Through Algorithmic Enhancements and Connection-aware Parallelization", journal = j-TRETS, volume = "13", number = "4", pages = "18:1--18:26", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3406959", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 2 07:58:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3406959", abstract = "Routing is a crucial step in Field Programmable Gate Array (FPGA) physical design, as it determines the routes of signals in the circuit, which impacts the design implementation quality significantly. It can be very time-consuming to successfully route \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2020:MRB, author = "Jialiang Zhang and Yue Zha and Nicholas Beckwith and Bangya Liu and Jing Li", title = "{MEG}: a {RISCV}-based System Emulation Infrastructure for Near-data Processing Using {FPGAs} and High-bandwidth Memory", journal = j-TRETS, volume = "13", number = "4", pages = "19:1--19:24", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409114", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 2 07:58:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3409114", abstract = "Emerging three-dimensional (3D) memory technologies, such as the Hybrid Memory Cube (HMC) and High Bandwidth Memory (HBM), provide high-bandwidth and massive memory-level parallelism. With the growing heterogeneity and complexity of computer systems \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Vaishnav:2020:FMF, author = "Anuj Vaishnav and Khoa Dang Pham and Joseph Powell and Dirk Koch", title = "{FOS}: a Modular {FPGA} Operating System for Dynamic Workloads", journal = j-TRETS, volume = "13", number = "4", pages = "20:1--20:28", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3405794", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 2 07:58:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3405794", abstract = "With FPGAs now being deployed in the cloud and at the edge, there is a need for scalable design methods that can incorporate the heterogeneity present in the hardware and software components of FPGA systems. Moreover, these FPGA systems need to be \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ioannou:2020:UNA, author = "Aggelos D. Ioannou and Konstantinos Georgopoulos and Pavlos Malakonakis and Dionisios N. Pnevmatikatos and Vassilis D. 
Papaefstathiou and Ioannis Papaefstathiou and Iakovos Mavroidis", title = "{UNILOGIC}: a Novel Architecture for Highly Parallel Reconfigurable Systems", journal = j-TRETS, volume = "13", number = "4", pages = "21:1--21:32", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409115", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Oct 2 07:58:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3409115", abstract = "One of the main characteristics of High-performance Computing (HPC) applications is that they become increasingly performance and power demanding, pushing HPC systems to their limits. Existing HPC systems have not yet reached exascale performance mainly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2021:CHP, author = "Xuzhi Zhang and Xiaozhe Shao and George Provelengios and Naveen Kumar Dumpala and Lixin Gao and Russell Tessier", title = "{CoNFV}: a Heterogeneous Platform for Scalable Network Function Virtualization", journal = j-TRETS, volume = "14", number = "1", pages = "1:1--1:29", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3409113", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:04 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", URL = "https://dl.acm.org/doi/10.1145/3409113", abstract = "Network function virtualization (NFV) is a powerful networking approach that leverages computing resources to perform a time-varying set of network processing functions. 
Although microprocessors can be used for this purpose, their performance limitations and lack of specialization present implementation challenges. In this article, we describe a new heterogeneous hardware-software NFV platform called CoNFV that provides scalability and programmability while supporting significant hardware-level parallelism and reconfiguration. Our computing platform takes advantage of both field-programmable gate arrays (FPGAs) and microprocessors to implement numerous virtual network functions (VNF) that can be dynamically customized to specific network flow needs. The most distinctive feature of our system is the use of global network state to coordinate NFV operations. Traffic management and hardware reconfiguration functions are performed by a global coordinator that allows for the rapid sharing of network function states and continuous evaluation of network function needs. With the help of state sharing mechanism offered by the coordinator, customer-defined VNF instances can be easily migrated between heterogeneous middleboxes as the network environment changes. A resource allocation and scheduling algorithm dynamically assesses resource deployments as network flows and conditions are updated. We show that our deployment algorithm can successfully reallocate FPGA and microprocessor resources in a fraction of a second in response to changes in network flow capacity and network security threats including intrusion.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Beasley:2021:OCH, author = "Alexander E. Beasley and C. T. Clarke and R. J. 
Watson", title = "An {OpenGL} Compliant Hardware Implementation of a Graphic Processing Unit Using Field Programmable Gate Array-System on Chip Technology", journal = j-TRETS, volume = "14", number = "1", pages = "2:1--2:24", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410357", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:04 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3410357", abstract = "FPGA-SoC technology provides a heterogeneous platform for advanced, high-performance systems. The System on Chip (SoC) architecture combines traditional single and multiple core processor topologies with flexible FPGA fabric. Dynamic reconfiguration \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kara:2021:PGC, author = "Kaan Kara and Gustavo Alonso", title = "{PipeArch}: Generic and Context-Switch Capable Data Processing on {FPGAs}", journal = j-TRETS, volume = "14", number = "1", pages = "3:1--3:28", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418465", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:04 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3418465", abstract = "Data processing systems based on FPGAs offer high performance and energy efficiency for a variety of applications. However, these advantages are achieved through highly specialized designs. The high degree of specialization leads to accelerators with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mohajer:2021:PUC, author = "Soheil Mohajer and Zhiheng Wang and Kia Bazargan and Yuyang Li", title = "Parallel Unary Computing Based on Function Derivatives", journal = j-TRETS, volume = "14", number = "1", pages = "4:1--4:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418464", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:04 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3418464", abstract = "The binary number representation has dominated digital logic for decades due to its compact storage requirements. An alternative representation is the unary number system: We use N bits, from which the first M are 1 and the rest are 0 to represent the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kyparissas:2021:LSC, author = "Nikolaos Kyparissas and Apostolos Dollas", title = "Large-scale Cellular Automata on {FPGAs}: a New Generic Architecture and a Framework", journal = j-TRETS, volume = "14", number = "1", pages = "5:1--5:32", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3423185", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:04 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3423185", abstract = "Cellular automata (CA) are discrete mathematical models discovered in the 1940s by John von Neumann and Stanislaw Ulam and have been used extensively in many scientific disciplines ever since. 
The present work evolved from a Field Programmable Gate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Peetermans:2021:DAC, author = "Adriaan Peetermans and Vladimir Rozi{\'c} and Ingrid Verbauwhede", title = "Design and Analysis of Configurable Ring Oscillators for True Random Number Generation Based on Coherent Sampling", journal = j-TRETS, volume = "14", number = "2", pages = "7:1--7:20", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433166", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:05 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3433166", abstract = "True Random Number Generators (TRNGs) are indispensable in modern cryptosystems. Unfortunately, to guarantee high entropy of the generated numbers, many TRNG designs require a complex implementation procedure, often involving manual placement and routing. In this work, we introduce, analyse, and compare three dynamic calibration mechanisms for the COherent Sampling ring Oscillator based TRNG: GateVar, WireVar, and LUTVar, enabling easy integration of the entropy source into complex systems. The TRNG setup procedure automatically selects a configuration that guarantees the security requirements. In the experiments, we show that two out of the three proposed mechanisms are capable of assuring correct TRNG operation even when an automatic placement is carried out and when the design is ported to another Field-Programmable Gate Array (FPGA) family. 
We generated random bits on both a Xilinx Spartan 7 and a Microsemi SmartFusion2 implementation that, without post processing, passed the AIS-31 statistical tests at a throughput of 4.65 Mbit/s and 1.47 Mbit/s, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cho:2021:PMC, author = "Shenghsun Cho and Mrunal Patel and Michael Ferdman and Peter Milder", title = "Practical Model Checking on {FPGAs}", journal = j-TRETS, volume = "14", number = "2", pages = "8:1--8:18", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3448272", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:05 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3448272", abstract = "Software verification is an important stage of the software development process, particularly for mission-critical systems. As the traditional methodology of using unit tests falls short of verifying complex software, developers are increasingly relying \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ma:2021:SFP, author = "Rui Ma and Jia-Ching Hsu and Tian Tan and Eriko Nurvitadhi and David Sheffield and Rob Pelt and Martin Langhammer and Jaewoong Sim and Aravind Dasu and Derek Chiou", title = "Specializing {FGPU} for Persistent Deep Learning", journal = j-TRETS, volume = "14", number = "2", pages = "10:1--10:23", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457886", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:05 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3457886", abstract = "Overlay architectures are a good way to enable fast development and debug on FPGAs at the expense of potentially limited performance compared to fully customized FPGA designs. When used in concert with hand-tuned FPGA solutions, performant overlay \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhou:2021:SHC, author = "Zhen Zhou and Debiao He and Zhe Liu and Min Luo and Kim-Kwang Raymond Choo", title = "A Software\slash Hardware Co-Design of {Crystals-Dilithium} Signature Scheme", journal = j-TRETS, volume = "14", number = "2", pages = "11:1--11:21", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447812", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jul 16 07:17:05 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3447812", abstract = "As quantum computers become more affordable and commonplace, existing security systems that are based on classical cryptographic primitives, such as RSA and Elliptic Curve Cryptography (ECC), will no longer be secure. Hence, there has been interest in designing post-quantum cryptographic (PQC) schemes, such as those based on lattice-based cryptography (LBC). The potential of LBC schemes is evidenced by the number of such schemes passing the selection of NIST PQC Standardization Process Round-3. One such scheme is the Crystals-Dilithium signature scheme, which is based on the hard module-lattice problem. However, there is no efficient implementation of the Crystals-Dilithium signature scheme. Hence, in this article, we present a compact hardware architecture containing elaborate modular multiplication units using the Karatsuba algorithm along with smart generators of address sequence and twiddle factors for NTT, which can complete polynomial addition/multiplication with the parameter setting of Dilithium in a short clock period. 
Also, we propose a fast software/hardware co-design implementation on Field Programmable Gate Array (FPGA) for the Dilithium scheme with a tradeoff between speed and resource utilization. Our co-design implementation outperforms a pure C implementation on a Nios-II processor of the platform Altera DE2-115, in the sense that our implementation is 11.2 and 7.4 times faster for signature and verification, respectively. In addition, we also achieve approximately 51\% and 31\% speed improvement for signature and verification, in comparison to the pure C implementation on processor ARM Cortex-A9 of ZYNQ-7020 platform.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yasudo:2021:APE, author = "Ryota Yasudo and Jos{\'e} G. F. Coutinho and Ana-Lucia Varbanescu and Wayne Luk and Hideharu Amano and Tobias Becker and Ce Guo", title = "Analytical Performance Estimation for Large-Scale Reconfigurable Dataflow Platforms", journal = j-TRETS, volume = "14", number = "3", pages = "12:1--12:21", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3452742", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Aug 21 07:50:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3452742", abstract = "Next-generation high-performance computing platforms will handle extreme data- and compute-intensive problems that are intractable with today's technology. A promising path in achieving the next leap in high-performance computing is to embrace \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Taka:2021:PVA, author = "Endri Taka and Konstantinos Maragos and George Lentaris and Dimitrios Soudris", title = "Process Variability Analysis in Interconnect, Logic, and Arithmetic Blocks of 16-nm {FinFET FPGAs}", journal = j-TRETS, volume = "14", number = "3", pages = "13:1--13:30", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458843", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Aug 21 07:50:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3458843", abstract = "In the current work, we study the process variability of logic, interconnect, and arithmetic/DSP resources in commercial 16-nm FPGAs. We create multiple, soft-macro sensors for each distinct resource under evaluation, and we deploy them across the FPGA \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sasongko:2021:HCS, author = "Arif Sasongko and I. M. 
Narendra Kumara and Arief Wicaksana and Fr{\'e}d{\'e}ric Rousseau and Olivier Muller", title = "Hardware Context Switch-based Cryptographic Accelerator for Handling Multiple Streams", journal = j-TRETS, volume = "14", number = "3", pages = "14:1--14:25", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460941", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Aug 21 07:50:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3460941", abstract = "The confidentiality and integrity of a stream has become one of the biggest issues in telecommunication. The best available algorithm handling the confidentiality of a data stream is the symmetric key block cipher combined with a chaining mode of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Reggiani:2021:ESM, author = "Enrico Reggiani and Emanuele {Del Sozzo} and Davide Conficconi and Giuseppe Natale and Carlo Moroni and Marco D. Santambrogio", title = "Enhancing the Scalability of Multi-{FPGA} Stencil Computations via Highly Optimized {HDL} Components", journal = j-TRETS, volume = "14", number = "3", pages = "15:1--15:33", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3461478", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Aug 21 07:50:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3461478", abstract = "Stencil-based algorithms are a relevant class of computational kernels in high-performance systems, as they appear in a plethora of fields, from image processing to seismic simulations, from numerical methods to physical modeling. 
Among the various \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Al-Hyari:2021:DLF, author = "Abeer Al-Hyari and Hannah Szentimrey and Ahmed Shamli and Timothy Martin and Gary Gr{\'e}wal and Shawki Areibi", title = "A Deep Learning Framework to Predict Routability for {FPGA} Circuit Placement", journal = j-TRETS, volume = "14", number = "3", pages = "16:1--16:28", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3465373", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Aug 21 07:50:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3465373", abstract = "The ability to accurately and efficiently estimate the routability of a circuit based on its placement is one of the most challenging and difficult tasks in the Field Programmable Gate Array (FPGA) flow. In this article, we present a novel, deep learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lai:2021:PSS, author = "Yi-Hsiang Lai and Ecenur Ustun and Shaojie Xiang and Zhenman Fang and Hongbo Rong and Zhiru Zhang", title = "Programming and Synthesis for Software-defined {FPGA} Acceleration: Status and Future Prospects", journal = j-TRETS, volume = "14", number = "4", pages = "17:1--17:39", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3469660", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3469660", abstract = "FPGA-based accelerators are increasingly popular across a broad range of applications, because they offer massive parallelism, high energy efficiency, and great flexibility for customizations. However, difficulties in programming and integrating FPGAs \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yang:2021:BWB, author = "Tao Yang and Zhezhi He and Tengchuan Kou and Qingzheng Li and Qi Han and Haibao Yu and Fangxin Liu and Yun Liang and Li Jiang", title = "{BISWSRBS}: a {Winograd}-based {CNN} Accelerator with a Fine-grained Regular Sparsity Pattern and Mixed Precision Quantization", journal = j-TRETS, volume = "14", number = "4", pages = "18:1--18:28", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3467476", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3467476", abstract = "Field-programmable Gate Array (FPGA) is a high-performance computing platform for Convolution Neural Networks (CNNs) inference. Winograd algorithm, weight pruning, and quantization are widely adopted to reduce the storage and arithmetic overhead of CNNs \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wijtvliet:2021:CER, author = "Mark Wijtvliet and Henk Corporaal and Akash Kumar", title = "{CGRA-EAM}---Rapid Energy and Area Estimation for Coarse-grained Reconfigurable Architectures", journal = j-TRETS, volume = "14", number = "4", pages = "19:1--19:28", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468874", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3468874", abstract = "Reconfigurable architectures are quickly gaining in popularity due to their flexibility and ability to provide high energy efficiency. However, reconfigurable systems allow for a huge design space. Iterative design space exploration (DSE) is often \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gu:2021:DGB, author = "Zhenghua Gu and Wenqing Wan and Jundong Xie and Chang Wu", title = "Dependency Graph-based High-level Synthesis for Maximum Instruction Parallelism", journal = j-TRETS, volume = "14", number = "4", pages = "20:1--20:15", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468875", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3468875", abstract = "Performance optimization is an important goal for High-level Synthesis (HLS).
Existing HLS scheduling algorithms are all based on Control and Data Flow Graph (CDFG) and will schedule basic blocks in sequential order. Our study shows that the sequential \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hung:2021:AGF, author = "Jos{\'e} Romero Hung and Chao Li and Pengyu Wang and Chuanming Shao and Jinyang Guo and Jing Wang and Guoyong Shi", title = "{ACE-GCN}: a Fast Data-driven {FPGA} Accelerator for {GCN} Embedding", journal = j-TRETS, volume = "14", number = "4", pages = "21:1--21:23", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3470536", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3470536", abstract = "ACE-GCN is a fast and resource/energy-efficient FPGA accelerator for graph convolutional embedding under data-driven and in-place processing conditions. Our accelerator exploits the inherent power law distribution and high sparsity commonly exhibited by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sabogal:2021:RFR, author = "Sebastian Sabogal and Alan George and Gary Crum", title = "Reconfigurable Framework for Resilient Semantic Segmentation for Space Applications", journal = j-TRETS, volume = "14", number = "4", pages = "22:1--22:32", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3472770", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Sep 21 07:21:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3472770", abstract = "Deep learning (DL) presents new opportunities for enabling spacecraft autonomy, onboard analysis, and intelligent applications for space missions. However, DL applications are computationally intensive and often infeasible to deploy on radiation-hardened \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shannon:2022:ISS, author = "Lesley Shannon", title = "Introduction to Special Section on {FPGA 2020}", journal = j-TRETS, volume = "15", number = "1", pages = "1:1--1:2", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485586", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3485586", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rybalkin:2022:WMG, author = "Vladimir Rybalkin and Jonas Ney and Menbere Kina Tekleyohannes and Norbert Wehn", title = "When Massive {GPU} Parallelism Ain't Enough: a Novel Hardware Architecture of {$2$D-LSTM} Neural Network", journal = j-TRETS, volume = "15", number = "1", pages = "2:1--2:35", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3469661", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3469661", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Papaphilippou:2022:HHP, author = "Philippos Papaphilippou and Jiuxi Meng and Nadeen Gebara and Wayne Luk", title = "{Hipernetch}: High-Performance {FPGA} Network Switch", journal = j-TRETS, volume = "15", number = "1", pages = "3:1--3:31", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477054", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3477054", abstract = "We present Hipernetch, a novel FPGA-based design for performing high-bandwidth network switching. FPGAs have recently become more popular in data centers due to their promising capabilities for a wide range of applications. With the recent surge in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Josipovic:2022:BPS, author = "Lana Josipovi{\'c} and Shabnam Sheikhha and Andrea Guerrieri and Paolo Ienne and Jordi Cortadella", title = "Buffer Placement and Sizing for High-Performance Dataflow Circuits", journal = j-TRETS, volume = "15", number = "1", pages = "4:1--4:32", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477053", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3477053", abstract = "Commercial high-level synthesis tools typically produce statically scheduled circuits. Yet, effective C-to-circuit conversion of arbitrary software applications calls for dataflow circuits, as they can handle efficiently variable latencies (e.g., caches), \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gross:2022:ESF, author = "Mathieu Gross and Konrad Hohentanner and Stefan Wiehler and Georg Sigl", title = "Enhancing the Security of {FPGA-SoCs} via the Usage of {ARM TrustZone} and a Hybrid-{TPM}", journal = j-TRETS, volume = "15", number = "1", pages = "5:1--5:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472959", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3472959", abstract = "Isolated execution is a concept commonly used for increasing the security of a computer system. 
In the embedded world, ARM TrustZone technology enables this goal and is currently used on mobile devices for applications such as secure payment or biometric \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wu:2022:LPF, author = "Chen Wu and Mingyu Wang and Xinyuan Chu and Kun Wang and Lei He", title = "Low-precision Floating-point Arithmetic for High-performance {FPGA}-based {CNN} Acceleration", journal = j-TRETS, volume = "15", number = "1", pages = "6:1--6:21", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3474597", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3474597", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2022:NTE, author = "Deming Chen", title = "Note from the {TRETS EiC} about the new Journal-first track in {FPT'21}", journal = j-TRETS, volume = "15", number = "1", pages = "7e:1--7e:1", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501280", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3501280", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "7e", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Streit:2022:DET, author = "Franz-Josef Streit and Paul Kr{\"u}ger and Andreas Becher and Stefan Wildermann and J{\"u}rgen Teich", title = "Design and Evaluation of a Tunable {PUF} Architecture for {FPGAs}", journal = j-TRETS, volume = "15", number = "1", pages = "7:1--7:27", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491237", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3491237", abstract = "FPGA-based Physical Unclonable Functions (PUF) have emerged as a viable alternative to permanent key storage by turning effects of inaccuracies during the manufacturing process of a chip into a unique, FPGA-intrinsic secret. However, many fixed PUF \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhou:2022:ROS, author = "Yun Zhou and Pongstorn Maidee and Chris Lavin and Alireza Kaviani and Dirk Stroobandt", title = "{RWRoute}: an Open-source Timing-driven Router for Commercial {FPGAs}", journal = j-TRETS, volume = "15", number = "1", pages = "8:1--8:27", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491236", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3491236", abstract = "One of the key obstacles to pervasive deployment of FPGA accelerators in data centers is their cumbersome programming model. 
Open source tooling is suggested as a way to develop alternative EDA tools to remedy this issue. Open source FPGA CAD tools have \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rasoulinezhad:2022:REB, author = "Seyedramin Rasoulinezhad and Esther Roorda and Steve Wilton and Philip H. W. Leong and David Boland", title = "Rethinking Embedded Blocks for Machine Learning Applications", journal = j-TRETS, volume = "15", number = "1", pages = "9:1--9:30", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491234", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3491234", abstract = "The underlying goal of FPGA architecture research is to devise flexible substrates that implement a wide variety of circuits efficiently. Contemporary FPGA architectures have been optimized to support networking, signal processing, and image processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Menzel:2022:SSA, author = "Johannes Menzel and Christian Plessl and Tobias Kenter", title = "The Strong Scaling Advantage of {FPGAs} in {HPC} for {$N$}-body Simulations", journal = j-TRETS, volume = "15", number = "1", pages = "10:1--10:30", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491235", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Jan 28 07:03:50 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3491235", abstract = "N-body methods are one of the essential algorithmic building blocks of high-performance and parallel computing. Previous research has shown promising performance for implementing n-body simulations with pairwise force calculations on FPGAs. However, to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Eguro:2022:ISIa, author = "Ken Eguro and Stephen Neuendorffer and Viktor Prasanna and Hongbo Rong", title = "Introduction to Special Issue on {FPGAs} in Data Centers", journal = j-TRETS, volume = "15", number = "2", pages = "11:1--11:2", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3493607", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3493607", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Keller:2022:ITR, author = "Andrew M. Keller and Michael J. Wirthlin", title = "The Impact of Terrestrial Radiation on {FPGAs} in Data Centers", journal = j-TRETS, volume = "15", number = "2", pages = "12:1--12:21", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3457198", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3457198", abstract = "Field programmable gate arrays (FPGAs) are used in large numbers in data centers around the world. They are used for cloud computing and computer networking. The most common type of FPGA used in data centers are re-programmable SRAM-based FPGAs. These \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Asiatici:2022:RCS, author = "Mikhail Asiatici and Paolo Ienne", title = "Request, Coalesce, Serve, and Forget: Miss-Optimized Memory Systems for Bandwidth-Bound Cache-Unfriendly Applications on {FPGAs}", journal = j-TRETS, volume = "15", number = "2", pages = "13:1--13:33", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3466823", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3466823", abstract = "Applications such as large-scale sparse linear algebra and graph analytics are challenging to accelerate on FPGAs due to the short irregular memory accesses, resulting in low cache hit rates. 
Nonblocking caches reduce the bandwidth required by misses by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dogan:2022:CBB, author = "Atakan Dogan and Kemal Ebcioglu", title = "Cloud Building Block Chip for Creating {FPGA} and {ASIC} Clouds", journal = j-TRETS, volume = "15", number = "2", pages = "14:1--14:35", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3466822", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3466822", abstract = "Hardware-accelerated cloud computing systems based on FPGA chips (FPGA cloud) or ASIC chips (ASIC cloud) have emerged as a new technology trend for power-efficient acceleration of various software applications. However, the operating systems and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Alonso:2022:EDS, author = "Tobias Alonso and Lucian Petrica and Mario Ruiz and Jakoba Petri-Koenig and Yaman Umuroglu and Ioannis Stamelos and Elias Koromilas and Michaela Blott and Kees Vissers", title = "{Elastic-DF}: Scaling Performance of {DNN} Inference in {FPGA} Clouds through Automatic Partitioning", journal = j-TRETS, volume = "15", number = "2", pages = "15:1--15:34", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3470567", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3470567", abstract = "Customized compute acceleration in the datacenter is key to the wider roll-out of applications based on deep neural network (DNN) inference. In this article, we investigate how to maximize the performance and scalability of field-programmable gate array \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Salamat:2022:NGN, author = "Sahand Salamat and Hui Zhang and Yang Seok Ki and Tajana Rosing", title = "\pkg{NASCENT2}: Generic Near-Storage Sort Accelerator for Data Analytics on {SmartSSD}", journal = j-TRETS, volume = "15", number = "2", pages = "16:1--16:29", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472769", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3472769", abstract = "As the size of data generated every day grows dramatically, the computational bottleneck of computer systems has shifted toward storage devices. The interface between the storage and the computational platforms has become the main limitation due to its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Damiani:2022:BFS, author = "Andrea Damiani and Giorgia Fiscaletti and Marco Bacis and Rolando Brondolin and Marco D. 
Santambrogio", title = "\pkg{BlastFunction}: a Full-stack Framework Bringing {FPGA} Hardware Acceleration to Cloud-native Applications", journal = j-TRETS, volume = "15", number = "2", pages = "17:1--17:27", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472958", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3472958", abstract = "``Cloud-native'' is the umbrella adjective describing the standard approach for developing applications that exploit cloud infrastructures' scalability and elasticity at their best. As the application complexity and user-bases grow, designing for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{DAlberto:2022:XID, author = "Paolo D'Alberto and Victor Wu and Aaron Ng and Rahul Nimaiyar and Elliott Delaye and Ashish Sirasao", title = "\pkg{xDNN}: Inference for Deep Convolutional Neural Networks", journal = j-TRETS, volume = "15", number = "2", pages = "18:1--18:29", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473334", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 2 08:59:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3473334", abstract = "We present xDNN, an end-to-end system for deep-learning inference based on a family of specialized hardware processors synthesized on Field-Programmable Gate Array (FPGAs) and Convolution Neural Networks (CNN). We present a design optimized for low \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Mbongue:2022:DMT,
  author =       "Joel Mandebi Mbongue and Danielle Tchuinkou Kwadjo and Alex Shuping and Christophe Bobda",
  title =        "Deploying Multi-tenant {FPGAs} within {Linux}-based Cloud Infrastructure",
  journal =      j-TRETS,
  volume =       "15",
  number =       "2",
  pages =        "19:1--19:31",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474058",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Mar 2 08:59:34 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/linux.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474058",
  abstract =     "Cloud deployments now increasingly exploit Field-Programmable Gate Array (FPGA) accelerators as part of virtual instances. While cloud FPGAs are still essentially single-tenant, the growing demand for efficient hardware acceleration paves the way to FPGA \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Hogervorst:2022:HAH,
  author =       "Tom Hogervorst and Razvan Nane and Giacomo Marchiori and Tong Dong Qiu and Markus Blatt and Alf Birger Rustad",
  title =        "Hardware Acceleration of High-Performance Computational Flow Dynamics Using High-Bandwidth Memory-Enabled Field-Programmable Gate Arrays",
  journal =      j-TRETS,
  volume =       "15",
  number =       "2",
  pages =        "20:1--20:35",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3476229",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Mar 2 08:59:34 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3476229",
  abstract =     "Scientific computing is at the core of many High-Performance Computing applications, including computational flow dynamics. Because of the utmost importance to simulate increasingly larger computational models, hardware acceleration is receiving increased \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sun:2022:BEC,
  author =       "Gongjin Sun and Seongyoung Kang and Sang-Woo Jun",
  title =        "\pkg{BurstZ+}: Eliminating The Communication Bottleneck of Scientific Computing Accelerators via Accelerated Compression",
  journal =      j-TRETS,
  volume =       "15",
  number =       "2",
  pages =        "21:1--21:34",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3476831",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Mar 2 08:59:34 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3476831",
  abstract =     "We present BurstZ+, an accelerator platform that eliminates the communication bottleneck between PCIe-attached scientific computing accelerators and their host servers, via hardware-optimized compression. While accelerators such as GPUs and FPGAs provide \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Eguro:2022:ISIb,
  author =       "Ken Eguro and Stephen Neuendorffer and Viktor Prasanna and Hongbo Rong",
  title =        "Introduction to Special Issue on {FPGAs} in Data Centers, {Part II}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "22:1--22:2",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3495231",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3495231",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tarafdar:2022:AOF,
  author =       "Naif Tarafdar and Giuseppe {Di Guglielmo} and Philip C. Harris and Jeffrey D. Krupa and Vladimir Loncar and Dylan S. Rankin and Nhan Tran and Zhenbin Wu and Qianfeng Shen and Paul Chow",
  title =        "{AIgean}: an Open Framework for Deploying Machine Learning on Heterogeneous Clusters",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "23:1--23:32",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3482854",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3482854",
  abstract =     "AIgean, pronounced like the sea, is an open framework to build and deploy machine learning (ML) algorithms on a heterogeneous cluster of devices (CPUs and FPGAs). We leverage two open source projects: Galapagos, for multi-FPGA deployment, and hls4ml, for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zeng:2022:UFV,
  author =       "Shulin Zeng and Guohao Dai and Hanbo Sun and Jun Liu and Shiyao Li and Guangjun Ge and Kai Zhong and Kaiyuan Guo and Yu Wang and Huazhong Yang",
  title =        "A Unified {FPGA} Virtualization Framework for General-Purpose Deep Neural Networks in the Cloud",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "24:1--24:31",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3480170",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3480170",
  abstract =     "INFerence-as-a-Service (INFaaS) has become a primary workload in the cloud. However, existing FPGA-based Deep Neural Network (DNN) accelerators are mainly optimized for the fastest speed of a single task, while the multi-tenancy of INFaaS has not been explored yet. As the demand for INFaaS keeps growing, simply increasing the number of FPGA-based DNN accelerators is not cost-effective, while merely sharing these single-task optimized DNN accelerators in a time-division multiplexing way could lead to poor isolation and high-performance loss for INFaaS. On the other hand, current cloud-based DNN accelerators have excessive compilation overhead, especially when scaling out to multi-FPGA systems for multi-tenant sharing, leading to unacceptable compilation costs for both offline deployment and online reconfiguration. Therefore, it is far from providing efficient and flexible FPGA virtualization for public and private cloud scenarios.\par Aiming to solve these problems, we propose a unified virtualization framework for general-purpose deep neural networks in the cloud, enabling multi-tenant sharing for both the Convolution Neural Network (CNN), and the Recurrent Neural Network (RNN) accelerators on a single FPGA. The isolation is enabled by introducing a two-level instruction dispatch module and a multi-core based hardware resources pool. Such designs provide isolated and runtime-programmable hardware resources, which further leads to performance isolation for multi-tenant sharing. On the other hand, to overcome the heavy re-compilation overheads, a tiling-based instruction frame package design and a two-stage static-dynamic compilation, are proposed. Only the lightweight runtime information is re-compiled with $ \approx $1 ms overhead, thus guaranteeing the private cloud's performance. Finally, the extensive experimental results show that the proposed virtualized solutions achieve up to $ 3.12 \times $ and $ 6.18 \times $ higher throughput in the private cloud compared with the static CNN and RNN baseline designs, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Alachiotis:2022:SPR,
  author =       "Nikolaos Alachiotis and Panagiotis Skrimponis and Manolis Pissadakis and Dionisios Pnevmatikatos",
  title =        "Scalable Phylogeny Reconstruction with Disaggregated Near-memory Processing",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "25:1--25:32",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3484983",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3484983",
  abstract =     "Disaggregated computer architectures eliminate resource fragmentation in next-generation datacenters by enabling virtual machines to employ resources such as CPUs, memory, and accelerators that are physically located on different servers. While this paves \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Brennsteiner:2022:RTD,
  author =       "Stefan Brennsteiner and Tughrul Arslan and John Thompson and Andrew McCormick",
  title =        "A Real-Time Deep Learning {OFDM} Receiver",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "26:1--26:25",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494049",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494049",
  abstract =     "Machine learning in the physical layer of communication systems holds the potential to improve performance and simplify design methodology.
Many algorithms have been proposed; however, the model complexity is often unfeasible for real-time deployment. The \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lienen:2022:DDR,
  author =       "Christian Lienen and Marco Platzner",
  title =        "Design of Distributed Reconfigurable Robotics Systems with {ReconROS}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "27:1--27:20",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494571",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494571",
  abstract =     "Robotics applications process large amounts of data in real time and require compute platforms that provide high performance and energy efficiency. FPGAs are well suited for many of these applications, but there is a reluctance in the robotics community \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cahill:2022:AFD,
  author =       "Eli Cahill and Brad Hutchings and Jeffrey Goeders",
  title =        "Approaches for {FPGA} Design Assurance",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "28:1--28:29",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491233",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491233",
  abstract =     "Field-Programmable Gate Arrays (FPGAs) are widely used for custom hardware implementations, including in many security-sensitive industries, such as defense, communications, transportation, medical, and more. Compiling source hardware descriptions to FPGA \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Faraji:2022:ACC,
  author =       "S. Rasoul Faraji and Pierre Abillama and Kia Bazargan",
  title =        "Approximate Constant-Coefficient Multiplication Using Hybrid Binary-Unary Computing for {FPGAs}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "29:1--29:25",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494570",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494570",
  abstract =     "Multipliers are used in virtually all Digital Signal Processing (DSP) applications such as image and video processing.
Multiplier efficiency has a direct impact on the overall performance of such applications, especially when real-time processing is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Du:2022:BAB,
  author =       "Gaoming Du and Bangyi Chen and Zhenmin Li and Zhenxing Tu and Junjie Zhou and Shenya Wang and Qinghao Zhao and Yongsheng Yin and Xiaolei Wang",
  title =        "A {BNN} Accelerator Based on Edge-skip-calculation Strategy and Consolidation Compressed Tree",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "30:1--30:20",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494569",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494569",
  abstract =     "Binarized neural networks (BNNs) and batch normalization (BN) have already become typical techniques in artificial intelligence today. Unfortunately, the massive accumulation and multiplication in BNN models bring challenges to field-programmable gate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Dewald:2022:ILP,
  author =       "Florian Dewald and Johanna Rohde and Christian Hochberger and Heiko Mantel",
  title =        "Improving Loop Parallelization by a Combination of Static and Dynamic Analyses in {HLS}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "31:1--31:31",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501801",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501801",
  abstract =     "High-level synthesis (HLS) can be used to create hardware accelerators for compute-intense software parts such as loop structures. Usually, this process requires significant amount of user interaction to steer kernel selection and optimizations. This can \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Matthews:2022:QDR,
  author =       "Eric Matthews and Alec Lu and Zhenman Fang and Lesley Shannon",
  title =        "{Quick-Div}: Rethinking Integer Divider Design for {FPGA}-based Soft-processors",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "32:1--32:27",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3502492",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3502492",
  abstract =     "In today's FPGA-based soft-processors, one of the slowest instructions is integer division. Compared to the low single-digit latency of other arithmetic operations, the fixed 32-cycle latency of radix-2 division is substantially longer. Given that today's soft-processors typically only implement radix-2 division --- if they support hardware division at all --- there is significant potential to improve the performance of integer dividers.\par In this work, we present a set of high-performance, data-dependent, variable-latency integer dividers for FPGA-based soft-processors that we call Quick-Div. We compare them to various radix-N dividers and provide a thorough analysis in terms of latency and resource usage. In addition, we analyze the frequency scaling for such divider designs when (1) treated as a stand-alone unit and (2) integrated as part of a high-performance soft-processor. Moreover, we provide additional theoretical analysis of different dividers' behaviour and develop a new better-performing Quick-Div variant, called Quick-radix-4. Experimental results show that our Quick-radix-4 design can achieve up to $ 6.8 \times $ better performance and $ 6.1 \times $ better performance-per-LUT over the radix-2 divider for applications such as random number generation. Even in cases where division operations constitute as little as 1\% of all executed instructions, Quick-radix-4 provides a performance uplift of 16\% compared to the radix-2 divider.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Roorda:2022:FAE,
  author =       "Esther Roorda and Seyedramin Rasoulinezhad and Philip H. W. Leong and Steven J. E.
Wilton",
  title =        "{FPGA} Architecture Exploration for {DNN} Acceleration",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "33:1--33:37",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3503465",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3503465",
  abstract =     "Recent years have seen an explosion of machine learning applications implemented on Field-Programmable Gate Arrays (FPGAs). FPGA vendors and researchers have responded by updating their fabrics to more efficiently implement machine learning accelerators, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE(review): in the next entry the thirteenth author previously
%%% read "Peter Hofste" (surname truncated); it has been corrected to
%%% "Peter Hofstee".  Please confirm against the article title page.
@Article{Bobda:2022:FFA,
  author =       "Christophe Bobda and Joel Mandebi Mbongue and Paul Chow and Mohammad Ewais and Naif Tarafdar and Juan Camilo Vega and Ken Eguro and Dirk Koch and Suranga Handagala and Miriam Leeser and Martin Herbordt and Hafsah Shahzad and Peter Hofstee and Burkhard Ringlein and Jakub Szefer and Ahmed Sanaullah and Russell Tessier",
  title =        "The Future of {FPGA} Acceleration in Datacenters and the Cloud",
  journal =      j-TRETS,
  volume =       "15",
  number =       "3",
  pages =        "34:1--34:42",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3506713",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 24 07:29:32 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3506713",
  abstract =     "In this article, we survey existing academic and commercial efforts to provide Field-Programmable Gate Array (FPGA) acceleration in datacenters and the cloud.
The goal is a critical review of existing systems and a discussion of their evolution from \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE(review): in the next entry the second author previously read
%%% "Lionel Sousa"; it has been corrected to "Leonel Sousa".  Please
%%% confirm against the article title page.
@Article{Mentens:2022:ISS,
  author =       "Nele Mentens and Leonel Sousa and Pedro Trancoso",
  title =        "Introduction to the Special Section on {FPL 2020}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "35:1--35:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3536336",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3536336",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shi:2022:EHF,
  author =       "Runbin Shi and Kaan Kara and Christoph Hagleitner and Dionysios Diamantopoulos and Dimitris Syrivelis and Gustavo Alonso",
  title =        "Exploiting {HBM} on {FPGAs} for Data Processing",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491238",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491238",
  abstract =     "Field Programmable Gate Arrays (FPGAs) are increasingly being used in data centers and the cloud due to their potential to accelerate certain workloads as well as for their architectural flexibility, since they can be used as accelerators, smart-NICs, or \ldots{}",
  acknowledgement = ack-nhfb,
ajournal =       "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Nikolic:2022:DPD,
  author =       "Stefan Nikoli{\'c} and Grace Zgheib and Paolo Ienne",
  title =        "Detailed Placement for Dedicated {LUT}-Level {FPGA} Interconnect",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "37:1--37:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501802",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501802",
  abstract =     "In this work, we develop timing-driven CAD support for FPGA architectures with direct connections between LUTs. We do so by proposing an efficient ILP-based detailed placer, which moves a carefully selected subset of LUTs from their original positions, so \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2022:RFH,
  author =       "Niansong Zhang and Xiang Chen and Nachiket Kapre",
  title =        "{RapidLayout}: Fast Hard Block Placement of {FPGA}-optimized Systolic Arrays Using Evolutionary Algorithm",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "38:1--38:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501803",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501803",
  abstract =     "Evolutionary algorithms can outperform conventional placement algorithms such as simulated annealing, analytical placement, and manual placement on runtime, wirelength, pipelining cost, and clock frequency when mapping hard block intensive designs such as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Singh:2022:AWP,
  author =       "Gagandeep Singh and Dionysios Diamantopoulos and Juan G{\'o}mez-Luna and Christoph Hagleitner and Sander Stuijk and Henk Corporaal and Onur Mutlu",
  title =        "Accelerating Weather Prediction Using Near-Memory Reconfigurable Fabric",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501804",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501804",
  abstract =     "Ongoing climate change calls for fast and accurate weather and climate modeling.
However, when solving large-scale weather prediction simulations, state-of-the-art CPU and GPU implementations suffer from limited performance and high energy consumption. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Malik:2022:HEA,
  author =       "Gurshaant Malik and Ian Elmore Lang and Rodolfo Pellizzoni and Nachiket Kapre",
  title =        "{HopliteML}: Evolving Application Customized {FPGA NoCs} with Adaptable Routers and Regulators",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3507699",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3507699",
  abstract =     "We can overcome the pessimism in worst-case routing latency analysis of timing-predictable Network-on-Chip (NoC) workloads by single-digit factors through the use of a hybrid field-programmable gate array (FPGA)-optimized NoC and workload-adapted \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cook:2022:INU,
  author =       "Hayden Cook and Jacob Arscott and Brent George and Tanner Gaskin and Jeffrey Goeders and Brad Hutchings",
  title =        "Inducing Non-uniform {FPGA} Aging Using Configuration-based Short Circuits",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3517042",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3517042",
  abstract =     "This work demonstrates a novel method of accelerating FPGA aging by configuring FPGAs to implement thousands of short circuits, resulting in high on-chip currents and temperatures. Patterns of ring oscillators are placed across the chip and are used to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Leong:2022:ISS,
  author =       "Philip H. W. Leong",
  title =        "Introduction to Special Section on {FPGA} 2021",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "42:1--42:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3536335",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3536335",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol.
Syst.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lu:2022:DSH,
  author =       "Alec Lu and Zhenman Fang and Lesley Shannon",
  title =        "Demystifying the Soft and Hardened Memory Systems of Modern {FPGAs} for Software Programmers through Microbenchmarking",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "43:1--43:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3517131",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3517131",
  abstract =     "Both modern datacenter and embedded Field Programmable Gate Arrays (FPGAs) provide great opportunities for high-performance and high-energy-efficiency computing. With the growing public availability of FPGAs from major cloud service providers such as AWS, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chen:2022:TRE,
  author =       "Xinyu Chen and Feng Cheng and Hongshi Tan and Yao Chen and Bingsheng He and Weng-Fai Wong and Deming Chen",
  title =        "{ThunderGP}: Resource-Efficient Graph Processing Framework on {FPGAs} with {HLS}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "44:1--44:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3517141",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3517141",
  abstract =     "FPGA has been an emerging computing infrastructure in datacenters benefiting from fine-grained parallelism, energy efficiency, and reconfigurability. Meanwhile, graph processing has attracted tremendous interest in data analytics, and its performance is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Langhammer:2022:SNA,
  author =       "Martin Langhammer and Eriko Nurvitadhi and Sergey Gribok and Bogdan Pasca",
  title =        "{Stratix 10 NX} Architecture",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3520197",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3520197",
  abstract =     "The advent of AI has driven the exploration of high-density low-precision arithmetic on FPGAs.
This has resulted in new methods in mapping both arithmetic functions as well as dataflows onto the fabric, as well as some changes to the embedded DSP Blocks. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "45", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Arora:2022:TSF, author = "Aman Arora and Moinak Ghosh and Samidh Mehta and Vaughn Betz and Lizy K. John", title = "Tensor Slices: {FPGA} Building Blocks For the {Deep Learning} Era", journal = j-TRETS, volume = "15", number = "4", pages = "46:1--46:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3529650", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3529650", abstract = "FPGAs are well-suited for accelerating deep learning (DL) applications owing to the rapidly changing algorithms, network architectures and computation requirements in this field. However, the generic building blocks available on traditional FPGAs limit \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "46", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ebcioglu:2022:HPM, author = "Kemal Ebcioglu and Ismail San", title = "Highly Parallel Multi-{FPGA} System Compilation from Sequential {C\slash C++} Code in the {AWS} Cloud", journal = j-TRETS, volume = "15", number = "4", pages = "47:1--47:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3507698", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3507698", abstract = "We present a High Level Synthesis compiler that automatically obtains a multi-chip accelerator system from a single-threaded sequential C/C++ application. Invoking the multi-chip accelerator is functionally identical to invoking the single-threaded \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "47", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Panchapakesan:2022:SEA, author = "Sathish Panchapakesan and Zhenman Fang and Jian Li", title = "{SyncNN}: Evaluating and Accelerating Spiking Neural Networks on {FPGAs}", journal = j-TRETS, volume = "15", number = "4", pages = "48:1--48:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3514253", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3514253", abstract = "Compared to conventional artificial neural networks, spiking neural networks (SNNs) are more biologically plausible and require less computation due to their event-driven nature of spiking neurons. 
However, the default asynchronous execution of SNNs also \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "48", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gibson:2022:ACM, author = "Kahlan Gibson and Esther Roorda and Daniel Holanda Noronha and Steven J. E. Wilton", title = "Adaptive Clock Management of {HLS}-generated Circuits on {FPGAs}", journal = j-TRETS, volume = "15", number = "4", pages = "49:1--49:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3520140", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3520140", abstract = "In this article, we present Syncopation, a performance-boosting fine-grained timing analysis and adaptive clock management technique for High-Level Synthesis-generated circuits implemented on Field-Programmable Gate Arrays. The key idea is to use the HLS \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "49", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sherwin:2022:MFF, author = "Krystine Dawn Sherwin and Kevin I-Kai Wang and Prabu Thiagaraj and Ben Stappers and Oliver Sinnen", title = "Median Filters on {FPGAs} for Infinite Data and Large, Rectangular Windows", journal = j-TRETS, volume = "15", number = "4", pages = "50:1--50:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3530273", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3530273", abstract = "Efficient architectures and implementations of median filters have been well investigated in the past. In this article, we focus on median filters for very big scientific applications with very large windows and an infinite stream of data, inspired by big \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "50", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cong:2022:FHT, author = "Jason Cong and Jason Lau and Gai Liu and Stephen Neuendorffer and Peichen Pan and Kees Vissers and Zhiru Zhang", title = "{FPGA HLS} Today: Successes, Challenges, and Opportunities", journal = j-TRETS, volume = "15", number = "4", pages = "51:1--51:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3530775", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:16 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3530775", abstract = "The year 2011 marked an important transition for FPGA high-level synthesis (HLS), as it went from prototyping to deployment. A decade later, in this article, we assess the progress of the deployment of HLS technology and highlight the successes in several \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "51", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sinnen:2023:ISS, author = "Oliver Sinnen and Qiang Liu and Azadeh Davoodi", title = "Introduction to Special Section on {FPT'20}", journal = j-TRETS, volume = "16", number = "1", pages = "1:1--1:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579850", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3579850", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shi:2023:OGR, author = "Kaichuang Shi and Xuegong Zhou and Hao Zhou and Lingli Wang", title = "An Optimized {GIB} Routing Architecture with Bent Wires for {FPGA}", journal = j-TRETS, volume = "16", number = "1", pages = "2:1--2:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3519599", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3519599", abstract = "Field-programmable gate arrays (FPGAs) are widely used because of the superiority in flexibility and lower non-recurring engineering cost. How to optimize the routing architecture is a key problem for FPGA architects because it has a large impact on FPGA \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2023:JBA, author = "Xiang Li and Peter Stanwicks and George Provelengios and Russell Tessier and Daniel Holcomb", title = "Jitter-based Adaptive True Random Number Generation Circuits for {FPGAs} in the Cloud", journal = j-TRETS, volume = "16", number = "1", pages = "3:1--3:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3487554", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3487554", abstract = "In this article, we present and evaluate a true random number generator (TRNG) design that is compatible with the restrictions imposed by cloud-based Field Programmable Gate Array (FPGA) providers such as Amazon Web Services (AWS) EC2 F1. Because cloud \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Que:2023:RRM, author = "Zhiqiang Que and Hiroki Nakahara and Hongxiang Fan and He Li and Jiuxi Meng and Kuen Hung Tsoi and Xinyu Niu and Eriko Nurvitadhi and Wayne Luk", title = "{Remarn}: a Reconfigurable Multi-threaded Multi-core Accelerator for Recurrent Neural Networks", journal = j-TRETS, volume = "16", number = "1", pages = "4:1--4:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3534969", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3534969", abstract = "This work introduces Remarn, a reconfigurable multi-threaded multi-core accelerator supporting both spatial and temporal co-execution of Recurrent Neural Network (RNN) inferences. It increases processing capabilities and quality of service of cloud-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Schelten:2023:HTR, author = "Niklas Schelten and Fritjof Steinert and Justin Knapheide and Anton Schulte and Benno Stabernack", title = "A High-Throughput, Resource-Efficient Implementation of the {RoCEv2} Remote {DMA} Protocol and its Application", journal = j-TRETS, volume = "16", number = "1", pages = "5:1--5:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3543176", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3543176", abstract = "The use of application-specific accelerators in data centers has been the state of the art for at least a decade, starting with the availability of General Purpose GPUs achieving higher performance either overall or per watt. In most cases, these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Giechaskiel:2023:CVC, author = "Ilias Giechaskiel and Shanquan Tian and Jakub Szefer", title = "{Cross-VM} Covert- and Side-Channel Attacks in Cloud {FPGAs}", journal = j-TRETS, volume = "16", number = "1", pages = "6:1--6:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3534972", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3534972", abstract = "The availability of FPGAs in cloud data centers offers rapid, on-demand access to reconfigurable hardware compute resources that users can adapt to their own needs. However, the low-level access to the FPGA hardware and associated resources such as the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wolf:2023:ASE, author = "Dennis Leander Wolf and Christoph Spang and Daniel Diener and Christian Hochberger", title = "Advantages of a Statistical Estimation Approach for Clock Frequency Estimation of Heterogeneous and Irregular {CGRAs}", journal = j-TRETS, volume = "16", number = "1", pages = "7:1--7:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531062", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3531062", abstract = "Estimating the maximum clock frequency of homogeneous Coarse Grained Reconfigurable Arrays/Architectures (CGRAs) with an arbitrary number of Processing Elements (PE) is difficult. Clock frequency estimation of highly heterogeneous CGRAs takes additional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ioannou:2023:SOA, author = "Lenos Ioannou and Suhaib A. 
Fahmy", title = "Streaming Overlay Architecture for Lightweight {LSTM} Computation on {FPGA SoCs}", journal = j-TRETS, volume = "16", number = "1", pages = "8:1--8:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3543069", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3543069", abstract = "Long-Short Term Memory (LSTM) networks, and Recurrent Neural Networks (RNNs) in general, have demonstrated their suitability in many time series data applications, especially in Natural Language Processing (NLP). Computationally, LSTMs introduce \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2023:SSA, author = "Xiangwei Li and Douglas L. Maskell and Carol Jingyi Li and Philip H. W. Leong and David Boland", title = "A Scalable Systolic Accelerator for Estimation of the Spectral Correlation Density Function and Its {FPGA} Implementation", journal = j-TRETS, volume = "16", number = "1", pages = "9:1--9:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546181", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3546181", abstract = "The spectral correlation density (SCD) function is the time-averaged correlation of two spectral components used for analyzing periodic signals with time-varying spectral content. Although the analysis is extremely powerful, it has not been widely adopted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tao:2023:LGL, author = "Zhuofu Tao and Chen Wu and Yuan Liang and Kun Wang and Lei He", title = "{LW-GCN}: a Lightweight {FPGA}-based Graph Convolutional Network Accelerator", journal = j-TRETS, volume = "16", number = "1", pages = "10:1--10:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550075", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3550075", abstract = "Graph convolutional networks (GCNs) have been introduced to effectively process non-Euclidean graph data. However, GCNs incur large amounts of irregularity in computation and memory access, which prevents efficient use of traditional neural network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Moini:2023:VSI, author = "Shayan Moini and Aleksa Deric and Xiang Li and George Provelengios and Wayne Burleson and Russell Tessier and Daniel Holcomb", title = "Voltage Sensor Implementations for Remote Power Attacks on {FPGAs}", journal = j-TRETS, volume = "16", number = "1", pages = "11:1--11:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3555048", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3555048", abstract = "This article presents a study of two types of on-chip FPGA voltage sensors based on ring oscillators (ROs) and time-to-digital converter (TDCs), respectively. It has previously been shown that these sensors are often used to extract side-channel \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kalantar:2023:FBA, author = "Amin Kalantar and Zachary Zimmerman and Philip Brisk", title = "{FPGA}-based Acceleration of Time Series Similarity Prediction: From Cloud to Edge", journal = j-TRETS, volume = "16", number = "1", pages = "12:1--12:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3555810", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3555810", abstract = "With the proliferation of low-cost sensors and the Internet of Things, the rate of producing data far exceeds the compute and storage capabilities of today's infrastructure. Much of this data takes the form of time series, and in response, there has been \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Vestias:2023:EDL, author = "M{\'a}rio V{\'e}stias and Rui P. Duarte and Jos{\'e} T. de Sousa and Hor{\'a}cio Neto", title = "Efficient Design of Low Bitwidth Convolutional Neural Networks on {FPGA} with Optimized Dot Product Units", journal = j-TRETS, volume = "16", number = "1", pages = "13:1--13:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546182", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3546182", abstract = "Designing hardware accelerators to run the inference of convolutional neural networks (CNN) is under intensive research. 
Several different architectures have been proposed along with hardware-oriented optimizations of the neural network models. One of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{deMoura:2023:DCR, author = "Rafael F{\~a}o de Moura and Joao Paulo Cardoso de Lima and Luigi Carro", title = "Data and Computation Reuse in {CNNs} Using Memristor {TCAMs}", journal = j-TRETS, volume = "16", number = "1", pages = "14:1--14:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3549536", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3549536", abstract = "Exploiting computational and data reuse in CNNs is crucial for the successful design of resource-constrained platforms. In image recognition applications, high levels of input locality and redundancy present in CNNs have become the golden goose for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Abdelhamid:2023:SMC, author = "Riadh {Ben Abdelhamid} and Yoshiki Yamaguchi and Taisuke Boku", title = "A Scalable Many-core Overlay Architecture on an {HBM2}-enabled Multi-Die {FPGA}", journal = j-TRETS, volume = "16", number = "1", pages = "15:1--15:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3547657", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3547657", abstract = "The overlay architecture enables to raise the abstraction level of hardware design and enhances hardware-accelerated applications' portability. In FPGAs, there is a growing awareness of the overlay structure as typified by many-core architecture. It works \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Iskandar:2023:NMC, author = "Veronia Iskandar and Mohamed A. 
{Abd El Ghany} and Diana G{\"o}hringer", title = "Near-memory Computing on {FPGAs} with {$3$D}-stacked Memories: Applications, Architectures, and Optimizations", journal = j-TRETS, volume = "16", number = "1", pages = "16:1--16:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3547658", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3547658", abstract = "The near-memory computing (NMC) paradigm has transpired as a promising method for overcoming the memory wall challenges of future computing architectures. Modern systems integrating 3D-stacked DRAM memory can be leveraged to prevent unnecessary data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shahsavani:2023:ECM, author = "Soheil Nazar Shahsavani and Arash Fayyazi and Mahdi Nazemi and Massoud Pedram", title = "Efficient Compilation and Mapping of Fixed Function Combinational Logic onto Digital Signal Processors Targeting Neural Network Inference and Utilizing High-level Synthesis", journal = j-TRETS, volume = "16", number = "2", pages = "17:1--17:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3559543", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3559543", abstract = "Recent efforts for improving the performance of neural network (NN) accelerators that meet today's application requirements have given rise to a new trend of logic-based NN inference relying on fixed function combinational logic. 
Mapping such large \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Choi:2023:FAP, author = "Young-Kyu Choi and Carlos Santillana and Yujia Shen and Adnan Darwiche and Jason Cong", title = "{FPGA} Acceleration of Probabilistic Sentential Decision Diagrams with High-level Synthesis", journal = j-TRETS, volume = "16", number = "2", pages = "18:1--18:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561514", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3561514", abstract = "Probabilistic Sentential Decision Diagrams (PSDDs) provide efficient methods for modeling and reasoning with probability distributions in the presence of massive logical constraints. PSDDs can also be synthesized from graphical models such as Bayesian \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ganewattha:2023:HAR, author = "Chanaka Ganewattha and Zaheer Khan and Janne Lehtom{\"a}ki and Matti Latva-Aho", title = "Hardware-accelerated Real-time Drift-awareness for Robust Deep Learning on Wireless {RF} Data", journal = j-TRETS, volume = "16", number = "2", pages = "19:1--19:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563394", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3563394", abstract = "Proactive and intelligent management of network resource utilization (RU) using deep learning (DL) can significantly improve the efficiency and performance of the next generation of wireless networks. However, variations in wireless RU are often affected \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Proulx:2023:SFC, author = "Alexandre Proulx and Jean-Yves Chouinard and Paul Fortier and Amine Miled", title = "A Survey on {FPGA} Cybersecurity Design Strategies", journal = j-TRETS, volume = "16", number = "2", pages = "20:1--20:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561515", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3561515", abstract = "This article presents a critical literature review on the security aspects of field-programmable gate array (FPGA) devices. 
FPGA devices present unique challenges to cybersecurity through their reconfigurable nature. The article also pays special \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Soldavini:2023:ACH, author = "Stephanie Soldavini and Karl Friebel and Mattia Tibaldi and Gerald Hempel and Jeronimo Castrillon and Christian Pilato", title = "Automatic Creation of High-bandwidth Memory Architectures from Domain-specific Languages: The Case of Computational Fluid Dynamics", journal = j-TRETS, volume = "16", number = "2", pages = "21:1--21:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563553", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3563553", abstract = "Numerical simulations can help solve complex problems. Most of these algorithms are massively parallel and thus good candidates for FPGA acceleration thanks to spatial parallelism. Modern FPGA devices can leverage high-bandwidth memory technologies, but \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yang:2023:HOF, author = "Gangqiang Yang and Zhengyuan Shi and Cheng Chen and Hailiang Xiong and Fudong Li and Honggang Hu and Zhiguo Wan", title = "Hardware Optimizations of {Fruit-80} Stream Cipher: Smaller than Grain", journal = j-TRETS, volume = "16", number = "2", pages = "22:1--22:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3569455", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3569455", abstract = "Fruit-80, which emerged as an ultra-lightweight stream cipher with 80-bit secret key, is oriented toward resource-constrained devices in the Internet of Things. In this article, we propose area and speed optimization architectures of Fruit-80 on FPGAs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Basalama:2023:FEE, author = "Suhail Basalama and Atefeh Sohrabizadeh and Jie Wang and Licheng Guo and Jason Cong", title = "{FlexCNN}: an End-to-end Framework for Composing {CNN} Accelerators on {FPGA}", journal = j-TRETS, volume = "16", number = "2", pages = "23:1--23:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570928", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Mon Jul 3 07:48:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3570928", abstract = "With reduced data reuse and parallelism, recent convolutional neural networks (CNNs) create new challenges for FPGA acceleration. Systolic arrays (SAs) are efficient, scalable architectures for convolutional layers, but without proper optimizations, their \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Meyer:2023:MFD,
  author =       "Marius Meyer and Tobias Kenter and Christian Plessl",
  title =        "{Multi-FPGA} Designs and Scaling of {HPC} Challenge
                 Benchmarks via {MPI} and Circuit-switched Inter-{FPGA}
                 Networks",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "24:1--24:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3576200",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576200",
  abstract =     "While FPGA accelerator boards and their respective
                 high-level design tools are maturing, there is still a
                 lack of multi-FPGA applications, libraries, and not
                 least, benchmarks and reference implementations towards
                 sustained HPC usage of these devices. As \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ueno:2023:VVC,
  author =       "Tomohiro Ueno and Kentaro Sano",
  title =        "{VCSN}: Virtual Circuit-Switching Network for Flexible
                 and Simple-to-Operate Communication in {HPC FPGA}
                 Cluster",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "25:1--25:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579848",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579848",
  abstract =     "FPGA clusters promise to play a critical role in
                 high-performance computing (HPC) systems in the near
                 future due to their flexibility and high power
                 efficiency. The operation of large-scale
                 general-purpose FPGA clusters on which multiple users
                 run diverse \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Nayak:2023:IEE,
  author =       "Ankita Nayak and Keyi Zhang and Rajsekhar Setaluri and
                 Alex Carsello and Makai Mann and Christopher Torng and
                 Stephen Richardson and Rick Bahr and Pat Hanrahan and
                 Mark Horowitz and Priyanka Raina",
  title =        "Improving Energy Efficiency of {CGRAs} with
                 Low-Overhead Fine-Grained Power Domains",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "26:1--26:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3558394",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3558394",
  abstract =     "To effectively minimize static power for a wide range
                 of applications, power domains for coarse-grained
                 reconfigurable array (CGRA) architectures need to be
                 more fine-grained than those found in a typical
                 application-specific integrated circuit. However,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhao:2023:ASC,
  author =       "Kang Zhao and Yuchun Ma and Ruining He and Jixing
                 Zhang and Ning Xu and Jinian Bian",
  title =        "Adaptive Selection and Clustering of Partial
                 Reconfiguration Modules for Modern {FPGA} Design Flow",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "27:1--27:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567427",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567427",
  abstract =     "Dynamic Partially Reconfiguration (DPR) on FPGA has
                 attracted significant research interest in recent years
                 since it provides benefits such as reduced area and
                 flexible functionality. However, due to the lack of
                 supporting synthesis tools in the current \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tian:2023:SSA,
  author =       "Xingyu Tian and Zhifan Ye and Alec Lu and Licheng Guo
                 and Yuze Chi and Zhenman Fang",
  title =        "{SASA}: a Scalable and Automatic Stencil Acceleration
                 Framework for Optimized Hybrid Spatial and Temporal
                 Parallelism on {HBM}-based {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "28:1--28:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572547",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572547",
  abstract =     "Stencil computation is one of the fundamental
                 computing patterns in many application domains such as
                 scientific computing and image processing. While there
                 are promising studies that accelerate stencils on
                 FPGAs, there lacks an automated acceleration \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{M:2023:DAR,
  author =       "Dhayalakumar M. and Noor Mahammad Sk",
  title =        "Deterministic Approach for Range-enhanced
                 Reconfigurable Packet Classification Engine",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "29:1--29:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3586577",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3586577",
  abstract =     "Reconfigurable hardware is a promising technology for
                 implementing firewalls, routing mechanisms, and new
                 protocols for evolving high-performance network
                 systems. This work presents a novel deterministic
                 approach for a Range-enhanced Reconfigurable Packet
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Koch:2023:ISI,
  author =       "Andreas Koch and Wei Zhang",
  title =        "Introduction to the Special Issue on {FPT 2021}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "30:1--30:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603701",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603701",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Attia:2023:TSL,
  author =       "Sameh Attia and Vaughn Betz",
  title =        "Toward Software-like Debugging for {FPGAs} via
                 Checkpointing and Transaction-based Co-Simulation",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "31:1--31:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3552521",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3552521",
  abstract =     "Checkpoint-based debugging flows have recently been
                 developed that allow the user to move the design state
                 back and forth between an FPGA and a simulator. They
                 provide a softwarelike debugging experience by
                 combining the speed of hardware execution and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gebauer:2023:QMR,
  author =       "Richard Gebauer and Nick Karcher and Mehmed G{\"u}ler
                 and Oliver Sander",
  title =        "{QiCells}: a Modular {RFSoC}-based Approach to
                 Interface Superconducting Quantum Bits",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "32:1--32:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3571820",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3571820",
  abstract =     "Quantum computers will be a revolutionary extension of
                 the heterogeneous computing world. They consist of many
                 quantum bits (qubits) and require a careful design of
                 the interface between the classical computer
                 architecture and the quantum processor. For \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Suh:2023:AHC,
  author =       "Han-Sok Suh and Jian Meng and Ty Nguyen and Vijay
                 Kumar and Yu Cao and Jae-Sun Seo",
  title =        "Algorithm--hardware Co-optimization for
                 Energy-efficient Drone Detection on
                 Resource-constrained {FPGA}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "33:1--33:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3583074",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3583074",
  abstract =     "Convolutional neural network (CNN)-based object
                 detection has achieved very high accuracy; e.g.,
                 single-shot multi-box detectors (SSDs) can efficiently
                 detect and localize various objects in an input image.
                 However, they require a high amount of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Bucknall:2023:ZEE,
  author =       "Alex R. Bucknall and Suhaib A. Fahmy",
  title =        "{ZyPR}: End-to-end Build Tool and Runtime Manager for
                 Partial Reconfiguration of {FPGA SoCs} at the Edge",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "34:1--34:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3585521",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3585521",
  abstract =     "Partial reconfiguration (PR) is a key enabler to the
                 design and development of adaptive systems on modern
                 Field Programmable Gate Array (FPGA) Systems-on-Chip
                 (SoCs), allowing hardware to be adapted dynamically at
                 runtime. Vendor-supported PR \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Corts:2023:SPS,
  author =       "Reinout Corts and Nikolaos Alachiotis",
  title =        "A Survey of Processing Systems for Phylogenetics and
                 Population Genetics",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "35:1--35:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3588033",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3588033",
  abstract =     "The COVID-19 pandemic brought Bioinformatics into the
                 spotlight, revealing that several existing methods,
                 algorithms, and tools were not well prepared to handle
                 large amounts of genomic data efficiently. This led to
                 prohibitively long execution times and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Machado:2023:NNH,
  author =       "Pedro Machado and Jo{\~a}o Filipe Ferreira and Andreas
                 Oikonomou and T. M. McGinnity",
  title =        "{NeuroHSMD}: Neuromorphic Hybrid Spiking Motion
                 Detector",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "36:1--36:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3588318",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3588318",
  abstract =     "Vertebrate retinas are highly-efficient in processing
                 trivial visual tasks such as detecting moving objects,
                 which still represent complex challenges for modern
                 computers. In vertebrates, the detection of object
                 motion is performed by specialised retinal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% In the next entry, "Windows" is the common noun (sliding windows over
%%% a data stream), not a product name, so it is deliberately NOT
%%% brace-protected: styles that downcase titles must be free to print
%%% "sliding windows".

@Article{Geethakumari:2023:SAC,
  author =       "Prajith Ramakrishnan Geethakumari and Ioannis
                 Sourdis",
  title =        "Stream Aggregation with Compressed Sliding Windows",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "37:1--37:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3590774",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3590774",
  abstract =     "High performance stream aggregation is critical for
                 many emerging applications that analyze massive volumes
                 of data. Incoming data needs to be stored in a sliding
                 window during processing, in case the aggregation
                 functions cannot be computed \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Karakchi:2023:NND,
  author =       "Rasha Karakchi and Jason D. Bakos",
  title =        "{NAPOLY}: a Non-deterministic Automata Processor
                 {OverLaY}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "38:1--38:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593586",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593586",
  abstract =     "Deterministic and Non-deterministic Finite Automata
                 (DFA and NFA) comprise the core of many big data
                 applications. Recent efforts to develop Domain-Specific
                 Architectures (DSAs) for DFA/NFA have taken divergent
                 approaches, but achieving consistent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Raut:2023:EAE,
  author =       "Gopal Raut and Saurabh Karkun and Santosh Kumar
                 Vishvakarma",
  title =        "An Empirical Approach to Enhance Performance for
                 Scalable {CORDIC}-Based Deep Neural Networks",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "39:1--39:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596220",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596220",
  abstract =     "Practical implementation of deep neural networks
                 (DNNs) demands significant hardware resources,
                 necessitating high computational power and memory
                 bandwidth. While existing field-programmable gate array
                 (FPGA)-based DNN accelerators are primarily optimized
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Leeser:2023:AEA,
  author =       "Miriam Leeser",
  title =        "Artifact Evaluation for {ACM TRETS} Papers Submitted
                 from the {FPT} Journal Track",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "40:1--40:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596513",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596513",
  abstract =     "Authors of papers that were accepted to ACM TRETS via
                 the FPT 2022 journal track had the option of
                 participating in Artifact Evaluation (AE). Four papers
                 from this track volunteered to participate in the AE
                 process. All of these papers have been awarded
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Li:2023:FPF,
  author =       "Carol Jingyi Li and Xiangwei Li and Binglei Lou and
                 Craig T. Jin and David Boland and Philip H. W. Leong",
  title =        "Fixed-point {FPGA} Implementation of the {FFT}
                 Accumulation Method for Real-time Cyclostationary
                 Analysis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "41:1--41:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567429",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567429",
  abstract =     "The spectral correlation density (SCD) is an important
                 tool in cyclostationary signal detection and
                 classification. Even using efficient techniques based
                 on the fast Fourier transform (FFT), real-time
                 implementations are challenging because of the high
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% In the next entry the third author is written "Philip H. W. Leong",
%%% the same form used in entry Li:2023:FPF, so that both papers sort
%%% and index under one author name.
%%% NOTE(review): assumed to be the same person, because both entries
%%% share the co-authors Binglei Lou and David Boland -- confirm.

@Article{Lou:2023:FCF,
  author =       "Binglei Lou and David Boland and Philip H. W. Leong",
  title =        "{fSEAD}: a Composable {FPGA}-based Streaming Ensemble
                 Anomaly Detection Library",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "42:1--42:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3568992",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3568992",
  abstract =     "Machine learning ensembles combine multiple base
                 models to produce a more accurate output. They can be
                 applied to a range of machine learning problems,
                 including anomaly detection. In this article, we
                 investigate how to maximize the composability and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shi:2023:DSE,
  author =       "Zhengyuan Shi and Cheng Chen and Gangqiang Yang and
                 Hailiang Xiong and Fudong Li and Honggang Hu and
                 Zhiguo Wan",
  title =        "Design Space Exploration of {Galois} and {Fibonacci}
                 Configuration Based on {Espresso} Stream Cipher",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "43:1--43:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567428",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fibquart.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567428",
  abstract =     "Fibonacci and Galois are two different kinds of
                 configurations in stream ciphers. Although many
                 transformations between two configurations have been
                 proposed, there is no sufficient analysis of their FPGA
                 performance. Espresso stream cipher provides an
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Mao:2023:HPC,
  author =       "Gaoyu Mao and Donglong Chen and Guangyan Li and
                 Wangchen Dai and Abdurrashid Ibrahim Sanka and
                 {\c{C}}etin Kaya Ko{\c{c}} and Ray C. C. Cheung",
  title =        "High-performance and Configurable {SW\slash HW}
                 Co-design of Post-quantum Signature
                 {CRYSTALS-Dilithium}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "44:1--44:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569456",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569456",
  abstract =     "CRYSTALS-Dilithium is a lattice-based post-quantum
                 digital signature scheme that is resistant to attacks
                 by quantum computers and has been selected to be
                 standardized in the NIST post-quantum cryptography
                 (PQC) standardization process. However, the speed
                 performance and design flexibility of the Dilithium
                 still need to be evaluated. This article presents a
                 high-performance software\slash hardware co-design of
                 CRYSTALS-Dilithium based on the NIST PQC round-3
                 parameters. High-speed pipelined hardware modules for
                 NTT\slash INTT, point-wise multiplication\slash
                 addition, and for SHAKE are included in the design to
                 accelerate the time-consuming operations in Dilithium.
                 All hardware modules are parameterized, thus allowing
                 full support of runtime configuration to increase
                 versatility. Moreover, the proposed software\slash
                 hardware architecture and tight operating workflows
                 reduce the data transmission overhead between the
                 processor and other hardware modules. The hardware
                 accelerator is implemented with a reconfigurable logic
                 on FPGA and is integrated with the high-performance ARM
                 Cortex-A9 processor in the Xilinx Zynq Architecture. We
                 measure the performance of the software\slash hardware
                 system for Dilithium in NIST security levels 2, 3, and
                 5. Compared to pure software implementations, we
                 achieve 8.7--12.5 times speedup in Key generation,
                 6.3--7.3 times speedup in Sign, and 9.1--12.2 times
                 speedup in Verify operations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{He:2023:FIC,
  author =       "Pengzhou He and Tianyou Bao and Jiafeng Xie and
                 Moeness Amin",
  title =        "{FPGA} Implementation of Compact Hardware Accelerators
                 for Ring-Binary-{LWE}-based Post-quantum Cryptography",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "45:1--45:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569457",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569457",
  abstract =     "Post-quantum cryptography (PQC) has recently drawn
                 substantial attention from various communities owing to
                 the proven vulnerability of existing public-key
                 cryptosystems against the attacks launched from
                 well-established quantum computers. The
                 Ring-Binary-Learning-with-Errors (RBLWE), a variant of
                 Ring-LWE, has been proposed to build PQC for
                 lightweight applications. As more Field-Programmable
                 Gate Array (FPGA) devices are being deployed in
                 lightweight applications like Internet-of-Things (IoT)
                 devices, it would be interesting if the RBLWE-based PQC
                 can be implemented on the FPGA with ultra-low
                 complexity and flexible processing. However, thus far,
                 limited information is available for such
                 implementations. In this article, we propose novel
                 RBLWE-based PQC accelerators on the FPGA with ultra-low
                 implementation complexity and flexible timing. We first
                 present the process of deriving the key operation of
                 the RBLWE-based scheme into the proposed algorithmic
                 operation. The corresponding hardware accelerator is
                 then efficiently mapped from the proposed algorithm
                 with the help of algorithm-to-architecture
                 implementation techniques and extended to obtain
                 higher-throughput designs. The final complexity
                 analysis and implementation results (on a variety of
                 FPGAs) show that the proposed accelerators have
                 significantly smaller area-time complexities than the
                 state-of-the-art designs. Overall, the proposed
                 accelerators feature low implementation complexity and
                 flexible processing, making them desirable for emerging
                 FPGA-based lightweight applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Jun:2023:ASD,
  author =       "Hyegang Jun and Hanchen Ye and Hyunmin Jeong and
                 Deming Chen",
  title =        "{AutoScaleDSE}: a Scalable Design Space Exploration
                 Engine for High-Level Synthesis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "46:1--46:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572959",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572959",
  abstract =     "High-Level Synthesis (HLS) has enabled users to
                 rapidly develop designs targeted for FPGAs from the
                 behavioral description of the design. However, to
                 synthesize an optimal design capable of taking better
                 advantage of the target FPGA, a considerable amount
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chang:2023:AHC,
  author =       "Liang Chang and Xin Zhao and Jun Zhou",
  title =        "{ADAS}: a High Computational Utilization Dynamic
                 Reconfigurable Hardware Accelerator for Super
                 Resolution",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "47:1--47:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3570927",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3570927",
  abstract =     "Super-resolution (SR) based on deep learning has
                 obtained superior performance in image reconstruction.
                 Recently, various algorithm efforts have been committed
                 to improving image reconstruction quality and speed.
                 However, the inference of SR contains huge \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Skubich:2023:IRT,
  author =       "Christian Skubich and Peter Reichel and Marc
                 Reichenbach",
  title =        "Increasing the Robustness of {TERO-TRNGs} Against
                 Process Variation",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "48:1--48:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597418",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597418",
  abstract =     "The transition effect ring oscillator is a popular
                 design for building entropy sources because it is
                 compact, built from digital elements only, and is very
                 well suited for FPGAs. However, it is known to be quite
                 sensitive to process variation. Although \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fiege:2023:BBS,
  author =       "Nicolai Fiege and Peter Zipf",
  title =        "{BLOOP}: {Boolean} Satisfiability-based Optimized Loop
                 Pipelining",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "49:1--49:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599972",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599972",
  abstract =     "Modulo scheduling is the premier technique for
                 throughput maximization of loops in high-level
                 synthesis by interleaving consecutive loop iterations.
                 The number of clock cycles between data insertions is
                 called the initiation interval (II). For throughput
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Arora:2023:CDC,
  author =       "Aman Arora and Atharva Bhamburkar and Aatman Borda and
                 Tanmay Anand and Rishabh Sehgal and Bagus Hanindhito
                 and Pierre-Emmanuel Gaillardon and Jaydeep Kulkarni and
                 Lizy K. John",
  title =        "{CoMeFa}: Deploying Compute-in-Memory on {FPGAs} for
                 Deep Learning Acceleration",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "50:1--50:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603504",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603504",
  abstract =     "Block random access memories (BRAMs) are the storage
                 houses of FPGAs, providing extensive on-chip memory
                 bandwidth to the compute units implemented using logic
                 blocks and digital signal processing slices. We propose
                 modifying BRAMs to convert them to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Li:2023:ISS,
  author =       "Jing Li and Martin Herbordt",
  title =        "Introduction to the Special Section on {FCCM 2022}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632092",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632092",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "51", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wu:2023:TEA, author = "Guiming Wu and Qianwen He and Jiali Jiang and Zhenxiang Zhang and Yuan Zhao and Yinchao Zou and Jie Zhang and Changzheng Wei and Ying Yan and Hui Zhang", title = "{Topgun}: an {ECC} Accelerator for Private Set Intersection", journal = j-TRETS, volume = "16", number = "4", pages = "52:1--52:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603114", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3603114", abstract = "Elliptic Curve Cryptography (ECC), one of the most widely used asymmetric cryptographic algorithms, has been deployed in Transport Layer Security (TLS) protocol, blockchain, secure multiparty computation, and so on. As one of the most secure ECC curves, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "52", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Xu:2023:FAG, author = "Tiancheng Xu and Scott Rixner and Alan L. Cox", title = "An {FPGA} Accelerator for Genome Variant Calling", journal = j-TRETS, volume = "16", number = "4", pages = "53:1--53:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3595297", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3595297", abstract = "In genome analysis, it is often important to identify variants from a reference genome. 
However, identifying variants that occur with low frequency can be challenging, as it is computationally intensive to do so accurately. LoFreq is a widely used program \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "53", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Josipovic:2023:RSD, author = "Lana Josipovi{\'c} and Axel Marmet and Andrea Guerrieri and Paolo Ienne", title = "Resource Sharing in Dataflow Circuits", journal = j-TRETS, volume = "16", number = "4", pages = "54:1--54:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597614", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3597614", abstract = "To achieve resource-efficient hardware designs, high-level synthesis (HLS) tools share (i.e., time-multiplex) functional units among operations of the same type. This optimization is typically performed in conjunction with operation scheduling to ensure \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "54", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Cheng:2023:PCF, author = "Jianyi Cheng and Lana Josipovi{\'c} and John Wickerson and George A. 
Constantinides", title = "Parallelising Control Flow in Dynamic-scheduling High-level Synthesis", journal = j-TRETS, volume = "16", number = "4", pages = "55:1--55:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3599973", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3599973", abstract = "Recently, there is a trend to use high-level synthesis (HLS) tools to generate dynamically scheduled hardware. The generated hardware is made up of components connected using handshake signals. These handshake signals schedule the components at runtime \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "55", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ienne:2023:ISS, author = "Paolo Ienne", title = "Introduction to the Special Section on {FPGA 2022}", journal = j-TRETS, volume = "16", number = "4", pages = "56:1--56:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3618114", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3618114", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "56", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wang:2023:LSL, author = "Erwei Wang and Marie Auffret and Georgios-Ilias Stavrou and Peter Y. K. Cheung and George A. Constantinides and Mohamed S. Abdelfattah and James J. 
Davis", title = "Logic Shrinkage: Learned Connectivity Sparsification for {LUT}-Based Neural Networks", journal = j-TRETS, volume = "16", number = "4", pages = "57:1--57:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3583075", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3583075", abstract = "Field-programmable gate array (FPGA)-specific deep neural network (DNN) architectures using native lookup tables (LUTs) as independently trainable inference operators have been shown to achieve favorable area-accuracy and energy-accuracy trade-offs. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "57", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gao:2023:RAR, author = "Yizhao Gao and Song Wang and Hayden Kwok-Hay So", title = "A Reconfigurable Architecture for Real-time Event-based Multi-Object Tracking", journal = j-TRETS, volume = "16", number = "4", pages = "58:1--58:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3593587", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3593587", abstract = "Although advances in event-based machine vision algorithms have demonstrated unparalleled capabilities in performing some of the most demanding tasks, their implementations under stringent real-time and power constraints in edge systems remain a major \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "58", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Guo:2023:RAP, author = "Licheng Guo and Pongstorn Maidee and Yun Zhou and Chris Lavin and Eddie Hung and Wuxi Li and Jason Lau and Weikang Qiao and Yuze Chi and Linghao Song and Yuanlong Xiao and Alireza Kaviani and Zhiru Zhang and Jason Cong", title = "{RapidStream 2.0}: Automated Parallel Implementation of Latency-Insensitive {FPGA} Designs Through Partial Reconfiguration", journal = j-TRETS, volume = "16", number = "4", pages = "59:1--59:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3593025", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3593025", abstract = "Field-programmable gate arrays (FPGAs) require a much longer compilation cycle than conventional computing platforms such as CPUs. In this article, we shorten the overall compilation time by co-optimizing the HLS compilation (C-to-RTL) and the back-end \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "59", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Nechi:2023:FBD, author = "Anouar Nechi and Lukas Groth and Saleh Mulhem and Farhad Merchant and Rainer Buchty and Mladen Berekovic", title = "{FPGA}-based Deep Learning Inference Accelerators: Where Are We Standing?", journal = j-TRETS, volume = "16", number = "4", pages = "60:1--60:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3613963", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3613963", abstract = "Recently, artificial intelligence applications have become part of almost all emerging technologies around us. Neural networks, in particular, have shown significant advantages and have been widely adopted over other approaches in machine learning. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "60", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Leipnitz:2023:CAM, author = "Marcos T. Leipnitz and Gabriel L. 
Nazar", title = "Constraint-Aware Multi-Technique Approximate High-Level Synthesis for {FPGAs}", journal = j-TRETS, volume = "16", number = "4", pages = "61:1--61:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3624481", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624481", abstract = "Numerous approximate computing (AC) techniques have been developed to reduce the design costs in error-resilient application domains, such as signal and multimedia processing, data mining, machine learning, and computer vision, to trade-off computation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "61", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2023:CKC, author = "Kenneth Liu and Alec Lu and Kartik Samtani and Zhenman Fang and Licheng Guo", title = "{CHIP-KNNv2}: a Configurable and High-Performance {$K$}-Nearest Neighbors Accelerator on {HBM}-based {FPGAs}", journal = j-TRETS, volume = "16", number = "4", pages = "62:1--62:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3616873", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3616873", abstract = "The k-nearest neighbors (KNN) algorithm is an essential algorithm in many applications, such as similarity search, image classification, and database query. With the rapid growth in the dataset size and the feature dimension of each data point, processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "62", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Guo:2023:TST, author = "Licheng Guo and Yuze Chi and Jason Lau and Linghao Song and Xingyu Tian and Moazin Khatti and Weikang Qiao and Jie Wang and Ecenur Ustun and Zhenman Fang and Zhiru Zhang and Jason Cong", title = "{TAPA}: a Scalable Task-parallel Dataflow Programming Framework for Modern {FPGAs} with Co-optimization of {HLS} and Physical Design", journal = j-TRETS, volume = "16", number = "4", pages = "63:1--63:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3609335", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3609335", abstract = "In this article, we propose TAPA, an end-to-end framework that compiles a C++ task-parallel dataflow program into a high-frequency FPGA accelerator. Compared to existing solutions, TAPA has two major advantages. First, TAPA provides a set of convenient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "63", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", }
%%% Lu:2023:HET: the sixth author is entered given-name-first (Zhengfeng Huang),
%%% like the other authors of this entry, so that BibTeX takes Huang as the
%%% family name for sorting and abbreviation; the byline as extracted listed
%%% this one name family-name-first.
@Article{Lu:2023:HET, author = "Yingchun Lu and Yun Yang and Rong Hu and Huaguo Liang and Maoxiang Yi and Zhengfeng Huang and Yuanming Ma and Tian Chen and Liang Yao", title = "High-efficiency {TRNG} Design Based on Multi-bit Dual-ring Oscillator", journal = j-TRETS, volume = "16", number = "4", pages = "64:1--64:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3624991", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Fri Dec 22 06:11:49 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624991", abstract = "Unpredictable true random numbers are required in security technology fields such as information encryption, key generation, mask generation for anti-side-channel analysis, algorithm initialization, and so on. At present, the true random number generator (TRNG) is not enough to provide fast random bits by low-speed bits generation. Therefore, it is necessary to design a faster TRNG. This work presents an ultra-compact TRNG with high throughput based on a novel extendable dual-ring oscillator (DRO). Owing to multiple bits output per cycle in DRO can be used to obtain the original random sequence, the proposed DRO achieves a maximum resource utilization to build a more efficient TRNG, compared with the conventional TRNG system based on ring oscillator (RO), which only has a single output and needs to build multiple groups of ring oscillators. TRNG based on the 2-bit DRO and its 8-bit derivative structure has been verified on Xilinx Artix-7 and Kintex-7 FPGA under the automatic layout and routing and has achieved a throughput of 550 Mbps and 1,100 Mbps, respectively.
Moreover, in terms of throughput performance over operating frequency, hardware consumption, and entropy, the proposed scheme has obvious advantages. Finally, the generated sequences show good randomness in the test of NIST SP800-22 and Dieharder test suite and pass the entropy estimation test kit NIST SP800-90B and AIS-31.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "64", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Anupreetham:2024:HTF, author = "Anupreetham Anupreetham and Mohamed Ibrahim and Mathew Hall and Andrew Boutros and Ajay Kuzhively and Abinash Mohanty and Eriko Nurvitadhi and Vaughn Betz and Yu Cao and Jae-Sun Seo", title = "High Throughput {FPGA}-Based Object Detection via Algorithm-Hardware Co-Design", journal = j-TRETS, volume = "17", number = "1", pages = "1:1--1:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634919", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3634919", abstract = "Object detection and classification is a key task in many computer vision applications such as smart surveillance and autonomous vehicles. Recent advances in deep learning have significantly improved the quality of results achieved by these systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fan:2024:HDF, author = "Zimeng Fan and Wei Hu and Fang Liu and Dian Xu and Hong Guo and Yanxiang He and Min Peng", title = "A Hardware Design Framework for Computer Vision Models Based on Reconfigurable Devices", journal = j-TRETS, volume = "17", number = "1", pages = "2:1--2:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635157", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3635157", abstract = "In computer vision, the joint development of the algorithm and computing dimensions cannot be separated. Models and algorithms are constantly evolving, while hardware designs must adapt to new or updated algorithms. Reconfigurable devices are recognized \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Maschi:2024:SHS, author = "Fabio Maschi and Gustavo Alonso", title = "{Strega}: an {HTTP} Server for {FPGAs}", journal = j-TRETS, volume = "17", number = "1", pages = "3:1--3:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3611312", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3611312", abstract = "The computer architecture landscape is being reshaped by the new opportunities, challenges, and constraints brought by the cloud. 
On the one hand, high-level applications profit from specialised hardware to boost their performance and reduce deployment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Qiu:2024:FFD, author = "Yunhui Qiu and Yiqing Mao and Xuchen Gao and Sichao Chen and Jiangnan Li and Wenbo Yin and Lingli Wang", title = "{FDRA}: a Framework for a Dynamically Reconfigurable Accelerator Supporting Multi-Level Parallelism", journal = j-TRETS, volume = "17", number = "1", pages = "4:1--4:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3614224", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3614224", abstract = "Coarse-grained reconfigurable architectures (CGRAs) have emerged as promising accelerators due to their high flexibility and energy efficiency. However, existing open source works often lack integration of CGRAs with CPU systems and corresponding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kalomiros:2024:HAS, author = "John Kalomiros and John Vourvoulakis and Stavros Vologiannidis", title = "A Hardware Accelerator for the Semi-Global Matching Stereo Algorithm: an Efficient Implementation for the {Stratix V} and {Zynq UltraScale+} {FPGA} Technology", journal = j-TRETS, volume = "17", number = "1", pages = "5:1--5:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3615869", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3615869", abstract = "The semi-global matching stereo algorithm is a top performing algorithm in stereo vision. The recursive nature of the computations involved in this algorithm introduces an inherent data dependency problem, hindering the progressive computations of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Reis:2024:DDL, author = "Miguel Reis and M{\'a}rio V{\'e}stias and Hor{\'a}cio Neto", title = "Designing Deep Learning Models on {FPGA} with Multiple Heterogeneous Engines", journal = j-TRETS, volume = "17", number = "1", pages = "6:1--6:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3615870", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3615870", abstract = "Deep learning models are becoming more complex and heterogeneous with new layer types to improve their accuracy. 
This brings a considerable challenge to the designers of accelerators of deep neural networks. There have been several architectures and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{FaoDeMoura:2024:RNL, author = "Rafael {F{\~a}o De Moura} and Luigi Carro", title = "Reprogrammable Non-Linear Circuits Using {ReRAM} for {NN} Accelerators", journal = j-TRETS, volume = "17", number = "1", pages = "7:1--7:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617894", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3617894", abstract = "As the massive usage of artificial intelligence techniques spreads in the economy, researchers are exploring new techniques to reduce the energy consumption of Neural Network (NN) applications, especially as the complexity of NNs continues to increase. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Honorat:2024:ABS, author = "Alexandre Honorat and Micka{\"e}l Dardaillon and Hugo Miomandre and Jean-Fran{\c{c}}ois Nezan", title = "Automated Buffer Sizing of Dataflow Applications in a High-level Synthesis Workflow", journal = j-TRETS, volume = "17", number = "1", pages = "8:1--8:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626103", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3626103", abstract = "High-Level Synthesis (HLS) tools are mature enough to provide efficient code generation for computation kernels on FPGA hardware. For more complex applications, multiple kernels may be connected by a dataflow graph. Although some tools, such as Xilinx \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", }
%%% Noyez:2024:MMS: the fourth author's family name is spelled with its acute
%%% accent, written in the same braced accent form that other accented names
%%% in this file use, so that BibTeX treats the accented letter as one character.
@Article{Noyez:2024:MMS, author = "Louis Noyez and Nadia {El Mrabet} and Olivier Potin and Pascal V{\'e}ron", title = "{Montgomery} Multiplication Scalable Systolic Designs Optimized for {DSP48E2}", journal = j-TRETS, volume = "17", number = "1", pages = "9:1--9:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624571", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624571", abstract = "This article describes an extensive study of the use of DSP48E2 Slices in Ultrascale FPGAs to design hardware versions of the Montgomery Multiplication algorithm for the hardware acceleration of modular multiplications. Our fully scalable systolic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Soleimani:2024:PCA, author = "Parastoo Soleimani and David W. Capson and Kin Fun Li", title = "A Partitioned {CAM} Architecture with {FPGA} Acceleration for Binary Descriptor Matching", journal = j-TRETS, volume = "17", number = "1", pages = "10:1--10:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624749", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624749", abstract = "An efficient architecture for image descriptor matching that uses a partitioned content-addressable memory (CAM)-based approach is proposed.
CAM is frequently used in high-speed content-matching applications. However, due to its lack of functionality to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Weng:2024:TAS, author = "Olivia Weng and Gabriel Marcano and Vladimir Loncar and Alireza Khodamoradi and Abarajithan G. and Nojan Sheybani and Andres Meza and Farinaz Koushanfar and Kristof Denolf and Javier Mauricio Duarte and Ryan Kastner", title = "{Tailor}: Altering Skip Connections for Resource-Efficient Inference", journal = j-TRETS, volume = "17", number = "1", pages = "11:1--11:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624990", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624990", abstract = "Deep neural networks use skip connections to improve training convergence. However, these skip connections are costly in hardware, requiring extra buffers and increasing on- and off-chip memory utilization and bandwidth requirements. In this article, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hasler:2024:PAS, author = "Jennifer Hasler and Cong Hao", title = "Programmable Analog System Benchmarks Leading to Efficient Analog Computation Synthesis", journal = j-TRETS, volume = "17", number = "1", pages = "12:1--12:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625298", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3625298", abstract = "This effort develops the first rich suite of analog and mixed-signal benchmark of various sizes and domains, intended for use with contemporary analog and mixed-signal designs and synthesis tools. Benchmarking enables analog-digital co-design exploration \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gohringer:2024:IFS, author = "Diana G{\"o}hringer and Georgios Keramidas and Akash Kumar", title = "Introduction to the {FPL 2021} Special Section", journal = j-TRETS, volume = "17", number = "1", pages = "13:1--13:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635115", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3635115", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Nikolic:2024:EFS, author = "Stefan Nikoli{\'c} and Paolo Ienne", title = "Exploring {FPGA} Switch-Blocks without Explicitly Listing Connectivity Patterns", journal = j-TRETS, volume = "17", number = "1", pages = "14:1--14:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3597417", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3597417", abstract = "Increased lower metal resistance makes physical aspects of Field-Programmable Gate Array (FPGA) switch-blocks more relevant than before. The need to navigate a design space where each individual switch can have significant impact on the FPGA's performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2024:EFB, author = "Zhengyan Liu and Qiang Liu and Shun Yan and Ray C. C. Cheung", title = "An Efficient {FPGA}-based Depthwise Separable Convolutional Neural Network Accelerator with Hardware Pruning", journal = j-TRETS, volume = "17", number = "1", pages = "15:1--15:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3615661", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3615661", abstract = "Convolutional neural networks (CNNs) have been widely deployed in computer vision tasks. 
However, the computation and resource intensive characteristics of CNN bring obstacles to its application on embedded systems. This article proposes an efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2024:EVL, author = "Jeffrey Chen and Sang-Woo Jun and Sehwan Hong and Warrick He and Jinyeong Moon", title = "{Eciton}: Very Low-power Recurrent Neural Network Accelerator for Real-time Inference at the Edge", journal = j-TRETS, volume = "17", number = "1", pages = "16:1--16:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3629979", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3629979", abstract = "This article presents Eciton, a very low-power recurrent neural network accelerator for time series data within low-power edge sensor nodes, achieving real-time inference with a power consumption of 17 mW under load. Eciton reduces memory and chip \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sani:2024:EIU, author = "Sajjad Rostami Sani and Andy Ye", title = "Evaluating the Impact of Using Multiple-Metal Layers on the Layout Area of Switch Blocks for Tile-Based {FPGAs} in {FinFET} 7nm", journal = j-TRETS, volume = "17", number = "1", pages = "17:1--17:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3639055", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3639055", abstract = "A new area model for estimating the layout area of switch blocks is introduced in this work. The model is based on a realistic layout strategy. As a result, it not only takes into consideration the active area that is needed to construct a switch block \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2024:ADC, author = "Yonggen Li and Xin Li and Haibin Shen and Jicong Fan and Yanfeng Xu and Kejie Huang", title = "An All-digital Compute-in-memory {FPGA} Architecture for Deep Learning Acceleration", journal = j-TRETS, volume = "17", number = "1", pages = "18:1--18:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640469", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Wed Mar 20 07:25:09 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3640469", abstract = "Field Programmable Gate Array (FPGA) is a versatile and programmable hardware platform, which makes it a promising candidate for accelerating Deep Neural Networks (DNNs). However, FPGA's computing energy efficiency is low due to the domination of energy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Koch:2024:ISI, author = "Andreas Koch and Kentaro Sano", title = "Introduction to the Special Issue on {FPL 2022}", journal = j-TRETS, volume = "17", number = "2", pages = "19:1--19:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3643474", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3643474", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jia:2024:XHP, author = "Xijie Jia and Yu Zhang and Guangdong Liu and Xinlin Yang and Tianyu Zhang and Jia Zheng and Dongdong Xu and Zhuohuan Liu and Mengke Liu and Xiaoyang Yan and Hong Wang and Rongzhang Zheng and Li Wang and Dong Li and Satyaprakash Pareek and Jian Weng and Lu Tian and Dongliang Xie and Hong Luo and Yi Shan", title = "{XVDPU}: a High-Performance {CNN} Accelerator on the Versal Platform Powered by the {AI} Engine", journal = j-TRETS, volume = "17", number = "2", pages = "20:1--20:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617836", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3617836", abstract = "Today, convolutional neural networks (CNNs) are widely used in computer vision applications. However, the trends of higher accuracy and higher resolution generate larger networks. The requirements of computation or I/O are the key bottlenecks. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Xiao:2024:EEH, author = "Yuanlong Xiao and Dongjoon Park and Zeyu Jason Niu and Aditya Hota and Andr{\'e} DeHon", title = "{ExHiPR}: Extended High-Level Partial Reconfiguration for Fast Incremental {FPGA} Compilation", journal = j-TRETS, volume = "17", number = "2", pages = "21:1--21:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617837", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3617837", abstract = "Partial Reconfiguration (PR) is a key technique in the application design on modern FPGAs. However, current PR tools heavily rely on the developer to manually conduct PR module definition, floorplanning, and flow control at a low level. The existing PR \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Dann:2024:GSP, author = "Jonas Dann and Daniel Ritter and Holger Fr{\"o}ning", title = "{GraphScale}: Scalable Processing on {FPGAs} for {HBM} and Large Graphs", journal = j-TRETS, volume = "17", number = "2", pages = "22:1--22:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3616497", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3616497", abstract = "Recent advances in graph processing on FPGAs promise to alleviate performance bottlenecks with irregular memory access patterns. 
Such bottlenecks challenge performance for a growing number of important application areas like machine learning and data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Khan:2024:OSD, author = "Babar Khan and Carsten Heinz and Andreas Koch", title = "The Open-source {DeLiBA2} Hardware\slash Software Framework for Distributed Storage Accelerators", journal = j-TRETS, volume = "17", number = "2", pages = "23:1--23:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624482", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3624482", abstract = "With the trend towards ever larger ``big data'' applications, many of the gains achievable by using specialized compute accelerators become diminished due to the growing I/O overheads. While there have been several research efforts into computational \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Trautmann:2024:DCE, author = "Jens Trautmann and Paul Kr{\"u}ger and Andreas Becher and Stefan Wildermann and J{\"u}rgen Teich", title = "Design, Calibration, and Evaluation of Real-time Waveform Matching on an {FPGA}-based Digitizer at {10 GS/s}", journal = j-TRETS, volume = "17", number = "2", pages = "24:1--24:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635719", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3635719", abstract = "Digitizing side-channel signals at high sampling rates produces huge amounts of data, while side-channel analysis techniques only need those specific trace segments containing Cryptographic Operations (COs). For detecting these segments, waveform-matching \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yang:2024:HQO, author = "Geng Yang and Jie Lei and Zhenman Fang and Yunsong Li and Jiaqing Zhang and Weiying Xie", title = "{HyBNN}: Quantifying and Optimizing Hardware Efficiency of Binary Neural Networks", journal = j-TRETS, volume = "17", number = "2", pages = "25:1--25:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631610", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3631610", abstract = "Binary neural network (BNN), where both the weight and the activation values are represented with one bit, provides an attractive alternative to deploy highly efficient deep learning inference on resource-constrained edge devices. However, our \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Albartus:2024:MPX, author = "Nils Albartus and Maik Ender and Jan-Niklas M{\"o}ller and Marc Fyrbiak and Christof Paar and Russell Tessier", title = "On the Malicious Potential of {Xilinx}'s Internal Configuration Access Port {(ICAP)}", journal = j-TRETS, volume = "17", number = "2", pages = "26:1--26:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633204", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3633204", abstract = "Field Programmable Gate Arrays (FPGAs) have become increasingly popular in computing platforms. 
With recent advances in bitstream format reverse engineering, the scientific community has widely explored static FPGA security threats. For example, it is now \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Trochatos:2024:CCF, author = "Theodoros Trochatos and Anthony Etim and Jakub Szefer", title = "Covert-channels in {FPGA}-enabled {SmartSSDs}", journal = j-TRETS, volume = "17", number = "2", pages = "27:1--27:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635312", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3635312", abstract = "Cloud computing providers today offer access to a variety of devices, which users can rent and access remotely in a shared setting. Among these devices are SmartSSDs, which are solid-state disks (SSD) augmented with an FPGA, enabling users to instantiate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "27", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{DelSozzo:2024:ATS, author = "Emanuele {Del Sozzo} and Davide Conficconi and Kentaro Sano", title = "Across Time and Space: {Senju}'s Approach for Scaling Iterative Stencil Loop Accelerators on Single and Multiple {FPGAs}", journal = j-TRETS, volume = "17", number = "2", pages = "28:1--28:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634920", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3634920", abstract = "Stencil-based applications play an essential role in high-performance systems as they occur in numerous computational areas, such as partial differential equation solving. In this context, Iterative Stencil Loops (ISLs) represent a prominent and well-known \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "28", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bao:2024:AFI, author = "Tianyou Bao and Pengzhou He and Jiafeng Xie and H. S. 
Jacinto", title = "{AEKA}: {FPGA} Implementation of Area-Efficient {Karatsuba} Accelerator for Ring-Binary-{LWE}-Based Lightweight {PQC}", journal = j-TRETS, volume = "17", number = "2", pages = "29:1--29:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637215", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3637215", abstract = "Lightweight PQC-related research and development have gradually gained attention from the research community recently. Ring-Binary-Learning-with-Errors (RBLWE)-based encryption scheme (RBLWE-ENC), a promising lightweight PQC based on small parameter sets to fit related applications (but not in favor of deploying popular fast algorithms like number theoretic transform). To solve this problem, in this article, we present a novel implementation of hardware acceleration for RBLWE-ENC based on Karatsuba algorithm, particularly on the field-programmable gate array (FPGA) platform. In detail, we have proposed an area-efficient Karatsuba Accelerator (AEKA) for RBLWE-ENC, based on three layers of innovative efforts. First of all, we reformulate the signal processing sequence within the major arithmetic component of the KA-based polynomial multiplication for RBLWE-ENC to obtain a new algorithm. Then, we have designed the proposed algorithm into a new hardware accelerator with several novel algorithm-to-architecture mapping techniques. Finally, we have conducted thorough complexity analysis and comparison to demonstrate the efficiency of the proposed accelerator, e.g., it involves 62.5\% higher throughput and 60.2\% less area-delay product (ADP) than the state-of-the-art design for $ n = 512 $ (Virtex-7 device, similar setup). 
The proposed AEKA design strategy is highly efficient on the FPGA devices, i.e., small resource usage with superior timing, which can be integrated with other necessary systems for lightweight-oriented high-performance applications (e.g., servers). The outcome of this work is also expected to generate impacts for lightweight PQC advancement.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "29", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hossfeld:2024:HEC, author = "Konstantin Ho{\ss}feld and Hans Jakob Damsgaard and Jari Nurmi and Michaela Blott and Thomas B. Preu{\ss}er", title = "High-efficiency Compressor Trees for Latest {AMD FPGAs}", journal = j-TRETS, volume = "17", number = "2", pages = "30:1--30:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3645097", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3645097", abstract = "High-fan-in dot product computations are ubiquitous in highly relevant application domains, such as signal processing and machine learning. Particularly, the diverse set of data formats used in machine learning poses a challenge for flexible efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "30", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sahoo:2024:ADF, author = "Siva Satyendra Sahoo and Salim Ullah and Akash Kumar", title = "{AxOMaP}: Designing {FPGA}-based Approximate Arithmetic Operators using Mathematical Programming", journal = j-TRETS, volume = "17", number = "2", pages = "31:1--31:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3648694", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3648694", abstract = "With the increasing application of machine learning (ML) algorithms in embedded systems, there is a rising necessity to design low-cost computer arithmetic for these resource-constrained systems. As a result, emerging models of computation, such as approximate and stochastic computing, that leverage the inherent error-resilience of such algorithms are being actively explored for implementing ML inference on resource-constrained systems. Approximate computing (AxC) aims to provide disproportionate gains in the power, performance, and area (PPA) of an application by allowing some level of reduction in its behavioral accuracy (BEHAV). Using approximate operators (AxOs) for computer arithmetic forms one of the more prevalent methods of implementing AxC. AxOs provide the additional scope for finer granularity of optimization, compared to only precision scaling of computer arithmetic. To this end, the design of platform-specific and cost-efficient approximate operators forms an important research goal. Recently, multiple works have reported the use of AI\slash ML-based approaches for synthesizing novel FPGA-based AxOs. 
However, most of such works limit the use of AI\slash ML to designing ML-based surrogate functions that are used during iterative optimization processes. To this end, we propose a novel data analysis-driven mathematical programming-based approach to synthesizing approximate operators for FPGAs. Specifically, we formulate mixed integer quadratically constrained programs based on the results of correlation analysis of the characterization data and use the solutions to enable a more directed search approach for evolutionary optimization algorithms. Compared to traditional evolutionary algorithms-based optimization, we report up to 21\% improvement in the hypervolume, for joint optimization of PPA and BEHAV, in the design of signed 8-bit multipliers. Further, we report up to 27\% better hypervolume than other state-of-the-art approaches to DSE for FPGA-based application-specific AxOs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "31", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2024:SHP, author = "Kexin Li and Shaoxian Xu and Zhiyuan Shao and Ran Zheng and Xiaofei Liao and Hai Jin", title = "{ScalaBFS2}: a High-performance {BFS} Accelerator on an {HBM}-enhanced {FPGA} Chip", journal = j-TRETS, volume = "17", number = "2", pages = "32:1--32:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650037", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3650037", abstract = "The introduction of High Bandwidth Memory (HBM) to the FPGA chip makes it possible for an FPGA-based accelerator to leverage the huge memory bandwidth of HBM to improve its performance when implementing a specific algorithm, which is especially true for \ldots{}", 
acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "32", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Keilbart:2024:DIC, author = "Chris Keilbart and Yuhui Gao and Martin Chua and Eric Matthews and Steven J. E. Wilton and Lesley Shannon", title = "Designing an {IEEE}-Compliant {FPU} that Supports Configurable Precision for Soft Processors", journal = j-TRETS, volume = "17", number = "2", pages = "33:1--33:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650036", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3650036", abstract = "Field Programmable Gate Arrays (FPGAs) are commonly used to accelerate floating-point (FP) applications. Although researchers have extensively studied FPGA FP implementations, existing work has largely focused on standalone operators and frequency-optimized designs. These works are not suitable for FPGA soft processors which are more sensitive to latency, impose a lower frequency ceiling, and require IEEE FP standard compliance. We present an open-source floating-point unit (FPU) for FPGA RISC-V soft processors that is fully IEEE compliant with configurable levels of FP precision. Our design emphasizes runtime performance with 25\% lower latency in the most common instructions compared to previous works while maintaining efficient resource utilization.\par Our FPU also allows users to explore various mantissa widths without having to rewrite or recompile their algorithms. 
We use this to investigate the scalability of our reduced-precision FPU across numerous microbenchmark functions as well as more complex case studies. Our experiments show that applications like the discrete cosine transformation and the Black--Scholes model can realize a speedup of more than 1.35x in conjunction with a 43\% and 35\% reduction in lookup table and flip-flop resources while experiencing less than a 0.025\% average loss in numerical accuracy with a 16-bit mantissa width.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "33", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{deBruin:2024:RBE, author = "Barry de Bruin and Kanishkan Vadivel and Mark Wijtvliet and Pekka J{\"a}{\"a}skel{\"a}inen and Henk Corporaal", title = "{R-Blocks}: an Energy-Efficient, Flexible, and Programmable {CGRA}", journal = j-TRETS, volume = "17", number = "2", pages = "34:1--34:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3656642", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3656642", abstract = "Emerging data-driven applications in the embedded, e-Health, and internet of things (IoT) domain require complex on-device signal analysis and data reduction to maximize energy efficiency on these energy-constrained devices. Coarse-grained reconfigurable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "34", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2024:HNF, author = "Sichao Chen and Chang Cai and Su Zheng and Jiangnan Li and Guowei Zhu and Jingyuan Li and Yazhou Yan and Yuan Dai and Wenbo Yin and Lingli Wang", title = "{HierCGRA}: a Novel Framework for Large-scale {CGRA} with Hierarchical Modeling and Automated Design Space Exploration", journal = j-TRETS, volume = "17", number = "2", pages = "35:1--35:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3656176", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Jun 4 06:09:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3656176", abstract = "Coarse-grained reconfigurable arrays (CGRAs) are promising design choices in computation-intensive domains, since they can strike a balance between energy efficiency and flexibility. A typical CGRA comprises processing elements (PEs) that can execute \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "35", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Campos:2024:EEC, author = "Javier Campos and Jovan Mitrevski and Nhan Tran and Zhen Dong and Amir Gholaminejad and Michael W. 
Mahoney and Javier Duarte", title = "End-to-end Codesign of {Hessian}-aware Quantized Neural Networks for {FPGAs}", journal = j-TRETS, volume = "17", number = "3", pages = "36:1--36:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3662000", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3662000", abstract = "We develop an end-to-end workflow for the training and implementation of co-designed neural networks (NNs) for efficient field-programmable gate array (FPGA) hardware. Our approach leverages Hessian-aware quantization of NNs, the Quantized Open Neural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "36", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Haase:2024:NLE, author = "Julian Haase and Najdet Charaf and Alexander Gro{\ss} and Diana G{\"o}hringer", title = "{NC-Library}: Expanding {SystemC} Capabilities for Nested {reConfigurable} Hardware Modelling", journal = j-TRETS, volume = "17", number = "3", pages = "37:1--37:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3662001", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3662001", abstract = "As runtime reconfiguration is used in an increasing number of hardware architectures, new simulation and modeling tools are needed to support the developer during the design phases. In this article, a language extension for SystemC is presented, together \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "37", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Xu:2024:ESA, author = "Shiyao Xu and Jingfei Jiang and Jinwei Xu and Xifu Qian", title = "Efficient {SpMM} Accelerator for Deep Learning: {Sparkle} and Its Automated Generator", journal = j-TRETS, volume = "17", number = "3", pages = "38:1--38:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665896", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3665896", abstract = "Deep learning (DL) technology has made breakthroughs in a wide range of intelligent tasks, such as vision, language, recommendation systems, and so on. Sparse matrix multiplication (SpMM) is the key computation kernel of most sparse models. Conventional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "38", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lu:2024:SAA, author = "Alec Lu and Jahanvi Narendra Agrawal and Zhenman Fang", title = "{SQL2FPGA}: Automated Acceleration of {SQL} Query Processing on Modern {CPU-FPGA} Platforms", journal = j-TRETS, volume = "17", number = "3", pages = "39:1--39:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674843", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3674843", abstract = "Today's big data query engines are constantly under pressure to keep up with the rapidly increasing demand for faster processing of more complex workloads. 
In the past few years, FPGA-based database acceleration efforts have demonstrated promising \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "39", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hirtum:2024:CND, author = "Lennart {Van Hirtum} and Patrick {De Causmaecker} and Jens Goemaere and Tobias Kenter and Heinrich Riebler and Michael Lass and Christian Plessl", title = "A Computation of the Ninth {Dedekind} Number Using {FPGA} Supercomputing", journal = j-TRETS, volume = "17", number = "3", pages = "40:1--40:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674147", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3674147", abstract = "This manuscript makes the claim of having computed the \(9\)th Dedekind number, D(9). This was done by accelerating the core operation of the process with an efficient FPGA design that outperforms an optimized 64-core CPU reference by 95 \(\times\). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "40", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Carril:2024:HAH, author = "Xavier Carril and Charalampos Kardaris and Jordi Ribes-Gonz{\'a}lez and Oriol Farr{\`a}s and Carles Hernandez and Vatistas Kostalabros and Joel Ulises Gonz{\'a}lez-Jim{\'e}nez and Miquel Moret{\'o}", title = "Hardware Acceleration for High-Volume Operations of {CRYSTALS-Kyber} and {CRYSTALS-Dilithium}", journal = j-TRETS, volume = "17", number = "3", pages = "41:1--41:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3675172", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3675172", abstract = "Many high-demand digital services need to perform several cryptographic operations, such as key exchange or security credentialing, in a concise amount of time. In turn, the security of some of these cryptographic schemes is threatened by advances in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "41", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Khatti:2024:PPA, author = "Moazin Khatti and Xingyu Tian and Ahmad Sedigh Baroughi and Akhil Raj Baranwal and Yuze Chi and Licheng Guo and Jason Cong and Zhenman Fang", title = "{PASTA}: Programming and Automation Support for Scalable Task-Parallel {HLS} Programs on Modern Multi-Die {FPGAs}", journal = j-TRETS, volume = "17", number = "3", pages = "42:1--42:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3676849", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3676849", abstract = "In recent years, the adoption of FPGAs in datacenters has increased, with a growing number of users choosing High-Level Synthesis (HLS) as their preferred programming method. While HLS simplifies FPGA programming, one notable challenge arises when scaling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "42", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fahmy:2024:ISS, author = "Suhaib A. Fahmy and Jason D. Bakos", title = "Introduction to the Special Section on {FPGA 2023}", journal = j-TRETS, volume = "17", number = "3", pages = "43:1--43:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695841", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3695841", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "43", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gribok:2024:CCP, author = "Sergey Gribok and Bogdan Pasca and Martin Langhammer", title = "{CSAIL2019} Crypto-Puzzle Solver Architecture", journal = j-TRETS, volume = "17", number = "3", pages = "44:1--44:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3639056", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3639056", abstract = "The CSAIL2019 time-lock puzzle is an unsolved cryptographic challenge introduced by Ron Rivest in 2019, replacing the solved LCS35 puzzle. Solving these types of puzzles requires large amounts of intrinsically sequential computations, with each iteration \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "44", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wong:2024:DDF, author = "Linus Y. Wong and Jialiang Zhang and Jing Li", title = "{DONGLE 2.0}: Direct {FPGA}-Orchestrated {NVMe} Storage for {HLS}", journal = j-TRETS, volume = "17", number = "3", pages = "45:1--45:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650038", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3650038", abstract = "Rapid growth in data size poses significant computational and memory challenges to data processing. FPGA accelerators and near-storage processing have emerged as compelling solutions for tackling the growing computational and memory requirements. 
Many \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "45", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2024:FAL, author = "Chaoqiang Liu and Xiaofei Liao and Long Zheng and Yu Huang and Haifeng Liu and Yi Zhang and Haiheng He and Haoyan Huang and Jingyi Zhou and Hai Jin", title = "{L-FNNG}: Accelerating Large-Scale {KNN} Graph Construction on {CPU--FPGA} Heterogeneous Platform", journal = j-TRETS, volume = "17", number = "3", pages = "46:1--46:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652609", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3652609", abstract = "Due to the high complexity of constructing exact $k$-nearest neighbor graphs, approximate construction has become a popular research topic. The NN-Descent algorithm is one of the representative in-memory algorithms. To effectively handle large datasets, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "46", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Du:2024:FFA, author = "Linfeng Du and Tingyuan Liang and Xiaofeng Zhou and Jinming Ge and Shangkun Li and Sharad Sinha and Jieru Zhao and Zhiyao Xie and Wei Zhang", title = "{FADO}: Floorplan-Aware Directive Optimization Based on Synthesis and Analytical Models for High-Level Synthesis Designs on Multi-Die {FPGAs}", journal = j-TRETS, volume = "17", number = "3", pages = "47:1--47:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653458", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3653458", abstract = "Multi-die FPGAs are widely adopted for large-scale accelerators, but optimizing high-level synthesis designs on these FPGAs faces two challenges. First, the delay caused by die-crossing nets creates an NP-hard floorplanning problem. Second, traditional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "47", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jaiyeoba:2024:DAD, author = "Oluwole Jaiyeoba and Kevin Skadron", title = "{Dynamic-ACTS} --- A Dynamic Graph Analytics Accelerator For {HBM}-Enabled {FPGAs}", journal = j-TRETS, volume = "17", number = "3", pages = "48:1--48:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3662002", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3662002", abstract = "Graph processing frameworks suffer performance degradation from under-utilization of available memory bandwidth, because graph traversal often exhibits poor locality. A prior work, ACTS [ 24 ], accelerates graph processing with FPGAs and High Bandwidth \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "48", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Drewes:2024:TTL, author = "Colin Drewes and Tyler Sheaves and Olivia Weng and Keegan Ryan and Bill Hunter and Christopher McCarty and Ryan Kastner and Dustin Richmond", title = "Turn on, Tune in, and Listen up: Maximizing Side-Channel Recovery in Cross-Platform Time-to-Digital Converters", journal = j-TRETS, volume = "17", number = "3", pages = "49:1--49:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3666092", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3666092", abstract = "Voltage fluctuation sensors measure minute changes in an FPGA power distribution network, allowing attackers to extract information from concurrently executing computations. Previous voltage fluctuation sensors make assumptions about the co-tenant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "49", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wilson:2024:IFT, author = "Andrew Elbert Wilson and Nathan Baker and Ethan Campbell and Michael Wirthlin", title = "Improving Fault Tolerance for {FPGA SoCs} through Post-Radiation Design Analysis", journal = j-TRETS, volume = "17", number = "3", pages = "50:1--50:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674841", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3674841", abstract = "FPGAs have been shown to operate reliably within harsh radiation environments by employing single-event upset (SEU) mitigation techniques, such as configuration scrubbing, triple-modular redundancy, error correction coding, and radiation aware \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "50", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhuang:2024:CCH, author = "Jinming Zhuang and Jason Lau and Hanchen Ye and Zhuoping Yang and Shixin Ji and Jack Lo and Kristof Denolf and Stephen Neuendorffer and Alex Jones and Jingtong Hu and Yiyu Shi and Deming Chen and Jason Cong and Peipei Zhou", title = "{CHARM 2.0}: Composing Heterogeneous Accelerators for Deep Learning on Versal {ACAP} Architecture", journal = j-TRETS, volume = "17", number = "3", pages = "51:1--51:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3686163", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Oct 1 11:41:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3686163", abstract = "Dense matrix multiply (MM) serves as one of the most heavily used kernels in deep learning applications. To cope with the high computation demands of these applications, heterogeneous architectures featuring both FPGA and dedicated ASIC accelerators have \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "51", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Singh:2024:ISI, author = "Satwant Singh and Carlos E. M. 
Marin and Yun (Eric) Liang and Yao Chen and Nele Mentens and Raymond Nijssen", title = "Introduction to the Special Issue on {FPGA}-based Embedded Systems for Industrial and {IoT} Applications", journal = j-TRETS, volume = "17", number = "4", pages = "52:1--52:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3698202", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3698202", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "52", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2024:PPG, author = "Guangyan Li and Zewen Ye and Donglong Chen and Wangchen Dai and Gaoyu Mao and Kejie Huang and Ray C. C. Cheung", title = "{ProgramGalois}: a Programmable Generator of Radix-4 Discrete {Galois} Transformation Architecture for Lattice-Based Cryptography", journal = j-TRETS, volume = "17", number = "4", pages = "53:1--53:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689437", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3689437", abstract = "Lattice-based cryptography (LBC) has been established as a prominent research field, with particular attention on post-quantum cryptography (PQC) and fully homomorphic encryption (FHE). As the implementing bottleneck of PQC and FHE, number theoretic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "53", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lopez-Valdivieso:2024:DIH, author = "Jonathan Lopez-Valdivieso and Rene Cumplido", title = "Design and Implementation of Hardware--Software Architecture Based on Hashes for {SPHINCS+}", journal = j-TRETS, volume = "17", number = "4", pages = "54:1--54:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653459", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3653459", abstract = "Advances in quantum computing have posed a future threat to today's cryptography. With the advent of these quantum computers, security could be compromised. Therefore, the National Institute of Standards and Technology (NIST) has issued a request for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "54", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Jellum:2024:CRO, author = "Erling Rennemo Jellum and Martin Schoeberl and Edward Ashford Lee and Milica Orlandic", title = "Codesign of Reactor-Oriented Hardware and Software for Cyber-Physical Systems", journal = j-TRETS, volume = "17", number = "4", pages = "55:1--55:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672083", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3672083", abstract = "Modern cyber-physical systems often make use of heterogeneous systems-on-chip with reconfigurable logic to provide adequate computing power and flexible I/O. However, modeling, verifying, and implementing the computations spanning CPUs and reconfigurable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "55", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gajjar:2024:RFR, author = "Archit Gajjar and Priyank Kashyap and Aydin Aysu and Paul Franzon and Yongjin Choi and Chris Cheng and Giacomo Pedretti and Jim Ignowski", title = "{RD-FAXID}: Ransomware Detection with {FPGA}-Accelerated {XGBoost}", journal = j-TRETS, volume = "17", number = "4", pages = "56:1--56:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688396", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3688396", abstract = "Over the last decade, there has been a rise in cyberattacks, particularly ransomware, causing significant disruption and financial repercussions across public and private sectors. Tremendous efforts have been spent on developing techniques to detect \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "56", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Li:2024:FRT, author = "Yuqi Li and Kehao Zhao and Jieru Zhao and Qirui Wang and Shuda Zhong and Nageswara Lalam and Ruishu Wright and Peipei Zhou and Kevin P. 
Chen", title = "{FiberFlex}: Real-time {FPGA}-based Intelligent and Distributed Fiber Sensor System for Pedestrian Recognition", journal = j-TRETS, volume = "17", number = "4", pages = "57:1--57:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690389", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3690389", abstract = "In recent years, security monitoring of public places and critical infrastructure has heavily relied on the widespread use of cameras, raising concerns about personal privacy violations. To balance the need for effective security monitoring with the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "57", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tang:2024:GOH, author = "Enhao Tang and Shun Li and Ruiqi Chen and Hao Zhou and Yuhanxiao Ma and Haoyang Zhang and Jun Yu and Kun Wang", title = "{Graph-OPU}: a Highly Flexible {FPGA-Based} Overlay Processor for Graph Neural Networks", journal = j-TRETS, volume = "17", number = "4", pages = "58:1--58:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3691636", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3691636", abstract = "Field-programmable gate arrays (FPGAs) are an ideal candidate for accelerating graph neural networks (GNNs). However, the FPGA redeployment process is time-consuming when updating or switching between diverse GNN models across different applications. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "58", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liu:2024:FBS, author = "Yajing Liu and Ruiqi Chen and Shuyang Li and Jing Yang and Shun Li and Bruno da Silva", title = "{FPGA}-Based Sparse Matrix Multiplication Accelerators: From State-of-the-Art to Future Opportunities", journal = j-TRETS, volume = "17", number = "4", pages = "59:1--59:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3687480", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3687480", abstract = "Sparse matrix multiplication (SpMM) plays a critical role in high-performance computing applications, such as deep learning, image processing, and physical simulation. Field-Programmable Gate Arrays (FPGAs), with their configurable hardware resources, can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "59", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wu:2024:FAI, author = "Zi-Ming Wu and Meng-Yuan Zhao and Bin Yan and Jeng-Shyang Pan and Hong-Mei Yang", title = "{FPGA} Accelerated Implementation of {$3$D} Mesh Secret Sharing Based on Symmetric Similarity of Model", journal = j-TRETS, volume = "17", number = "4", pages = "60:1--60:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689049", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3689049", abstract = "Secret sharing is particularly important in the field of information security, which allows for the reconstruction of secret information from secure shares. However, due to the large amount of data and non-integer data type of 3D (three-dimensional) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "60", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2024:CTO, author = "Kuan-Yu Chen and Thomas Mason Nelson and Alireza Khadem and Morteza Fayazi and Sanjay Sri Vallabh Singapuram and Ronald Dreslinski and Nishil Talati and Hun-Seok Kim and David Blaauw", title = "{Canalis}: a Throughput-Optimized Framework for Real-Time Stream Processing of Wireless Communication", journal = j-TRETS, volume = "17", number = "4", pages = "61:1--61:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695880", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3695880", abstract = "Stream processing, which involves real-time computation of data as it is created or received, is vital for various applications, specifically wireless communication. The evolving protocols, the requirement for high-throughput, and the challenges of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "61", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ifrim:2024:SRF, author = "Rares Ifrim and Dumitrel Loghin and Decebal Popescu", title = "A Systematic Review of Fast, Scalable, and Efficient Hardware Implementations of Elliptic Curve Cryptography for Blockchain", journal = j-TRETS, volume = "17", number = "4", pages = "62:1--62:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696422", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Nov 23 13:27:04 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3696422", abstract = "Blockchain technology entered the enterprise domain under the name of permissioned blockchains and hybrid or verifiable database systems, as they provide a distributed solution that allows multiple distrusting parties to share common information. One \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "62", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hutchings:2025:TFI, author = "Daniel Hutchings and Adam Taylor and Jeffrey Goeders", title = "Toward {FPGA} Intellectual Property Encryption from Netlist to Bitstream", journal = j-TRETS, volume = "18", number = "1", pages = "1:1--1:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3656644", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3656644", abstract = "Current intellectual property (IP) encryption methods offered by field-programmable gate array (FPGA) vendors use an approach where the IP is decrypted during the computer-aided design (CAD) flow and remains unencrypted in the bitstream. Given the ease of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhao:2025:HDP, author = "Chenfeng Zhao and Clayton Faber and Roger Chamberlain and Xuan Zhang", title = "{HLPerf}: Demystifying the Performance of {HLS}-based Graph Neural Networks with Dataflow Architectures", journal = j-TRETS, volume = "18", number = "1", pages = "2:1--2:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3655627", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3655627", abstract = "The development of FPGA-based applications using HLS is fraught with performance pitfalls and large design space exploration times. 
These issues are exacerbated when the application is complicated and its performance is dependent on the input dataset, as \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sun:2025:PRE, author = "Mingqian Sun and Guangwei Xie and Fan Zhang and Wei Guo and Xitian Fan and Tianyang Li and Li Chen and Jiayu Du", title = "{PTME}: a Regular Expression Matching Engine Based on Speculation and Enumerative Computation on {FPGA}", journal = j-TRETS, volume = "18", number = "1", pages = "3:1--3:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3655626", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3655626", abstract = "Fast regular expression matching is an essential task for deep packet inspection. In previous works, the regular expression matching engine on FPGA struggled to achieve an ideal balance between resource consumption and throughput. Speculation and enumerative computation exploits the statistical properties of deterministic finite automata, allowing for more efficient pattern matching. Existing related designs mostly revolve around vector instructions and multiple processors/cores or SIMD instruction sets, with a lack of implementation on FPGA platforms. We design a parallelized two-character matching engine on FPGA for efficiently fast filtering off fields with no pattern features. We transform the state transitions with sequential dependencies to the existing problem of elements in one set, enabling the proposed design to achieve high throughput with low resource consumption and support dynamic updates. 
Results show that compared with the traditional DFA matching, with a maximum resource consumption of 25\% for on-chip FFs (74323/1045440) and LUTs (123902/522720), there is an improvement in throughput of 8.08--229.96$ \times $ speedup and 87.61--99.56\% speed-up (percentage improvement) for normal traffic, and 11.73--39.59$ \times $ speedup and 91.47--97.47\% speed-up (percentage improvement) for traffic with high-frequency match hits. Compared with the state-of-the-art similar implementation, our circuit on a single FPGA chip is superior to existing multi-core designs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tamimi:2025:DDA, author = "Sajjad Tamimi and Arthur Bernhardt and Florian Stock and Ilia Petrov and Andreas Koch", title = "{DANSEN}: Database Acceleration on Native Computational Storage by Exploiting {NDP}", journal = j-TRETS, volume = "18", number = "1", pages = "4:1--4:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3655625", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3655625", abstract = "This article introduces DANSEN, the hardware accelerator component for neoDBMS, a full-stack computational storage system designed to manage on-device execution of database queries/transactions as a Near-Data Processing (NDP)-operation. The proposed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Chen:2025:UPF, author = "Hongzheng Chen and Jiahao Zhang and Yixiao Du and Shaojie Xiang and Zichao Yue and Niansong Zhang and Yaohui Cai and Zhiru Zhang", title = "Understanding the Potential of {FPGA}-based Spatial Acceleration for Large Language Model Inference", journal = j-TRETS, volume = "18", number = "1", pages = "5:1--5:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3656177", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3656177", abstract = "Recent advancements in large language models (LLMs) boasting billions of parameters have generated a significant demand for efficient deployment in inference workloads. While hardware accelerators for Transformer-based models have been extensively studied, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Abouelhamayed:2025:PEP, author = "Ahmed Abouelhamayed and Angela Cui and Javier Fernandez-Marques and Nicholas Lane and Mohamed Abdelfattah", title = "{PQA}: Exploring the Potential of Product Quantization in {DNN} Hardware Acceleration", journal = j-TRETS, volume = "18", number = "1", pages = "6:1--6:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3656643", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3656643", abstract = "Conventional multiply-accumulate (MAC) operations have long dominated computation time for deep neural networks (DNNs), especially convolutional neural networks (CNNs). Recently, product quantization (PQ) has been applied to these workloads, replacing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Vakili:2025:DFL, author = "Shervin Vakili and Mobin Vaziri and Amirhossein Zarei and J. M. 
Pierre Langlois", title = "{DyRecMul}: Fast and Low-Cost Approximate Multiplier for {FPGAs} using Dynamic Reconfiguration", journal = j-TRETS, volume = "18", number = "1", pages = "7:1--7:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3663480", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3663480", abstract = "Multipliers are widely-used arithmetic operators in digital signal processing and machine learning (ML) circuits. Due to their relatively high complexity, they can have high latency and be a significant source of power consumption. One strategy to alleviate these limitations is to use approximate computing. This article thus introduces an original FPGA-based approximate multiplier specifically optimized for ML computations. It utilizes dynamically reconfigurable lookup table (LUT) primitives in AMD-Xilinx technology to realize the core part of the computations. The article provides an in-depth analysis of the hardware architecture, implementation outcomes, and accuracy evaluations of the multiplier proposed in INT8 precision. The article also facilitates the generalization of the proposed approximate multiplier idea to other datatypes, providing analysis and estimations for hardware cost and accuracy as a function of multiplier parameters. Implementation results on an AMD-Xilinx Kintex Ultrascale+ FPGA demonstrate remarkable savings of 64\% and 67\% in LUT utilization for signed multiplication and multiply-and-accumulation configurations, respectively when compared to the standard Xilinx multiplier core. 
Accuracy measurements on four popular deep learning (DL) benchmarks indicate a minimal average accuracy decrease of less than 0.29\% during post-training deployment, with the maximum reduction staying less than 0.33\%. The source code of this work is available on GitHub.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Miyagi:2025:SAL, author = "Ryota Miyagi and Ryota Yasudo and Kentaro Sano and Hideki Takase", title = "A Scalable Accelerator for Local Score Computation of Structure Learning in {Bayesian} Networks", journal = j-TRETS, volume = "18", number = "1", pages = "8:1--8:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3674842", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3674842", abstract = "A Bayesian network is a powerful tool for representing uncertainty in data, offering transparent and interpretable inference, unlike neural networks' black-box mechanisms. To fully harness the potential of Bayesian networks, it is essential to learn the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "8", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gozzi:2025:SHL, author = "Giovanni Gozzi and Michele Fiorito and Serena Curzel and Claudio Barone and Vito Giovanni Castellana and Marco Minutoli and Antonino Tumeo and Fabrizio Ferrandi", title = "{SPARTA}: High-Level Synthesis of Parallel Multi-Threaded Accelerators", journal = j-TRETS, volume = "18", number = "1", pages = "9:1--9:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3677035", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3677035", abstract = "This article presents a methodology for the Synthesis of PARallel multi-Threaded Accelerators (SPARTA) from OpenMP annotated C/C++ specifications. SPARTA extends an open-source HLS tool, enabling the generation of accelerators that provide latency \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "9", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Nickel:2025:SAH, author = "Matthias Nickel and Diana G{\"o}hringer", title = "A Survey on Architectures, Hardware Acceleration and Challenges for In-Network Computing", journal = j-TRETS, volume = "18", number = "1", pages = "10:1--10:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3699514", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3699514", abstract = "By moving data and computation away from the end user to more powerful servers in the cloud or to cloudlets at the edge, end user devices only need to compute locally for small amounts of data and when low latency is required. However, with the advent of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "10", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ferikoglou:2025:CCA, author = "Aggelos Ferikoglou and Andreas Kakolyris and Dimosthenis Masouros and Dimitrios Soudris and Sotirios Xydis", title = "{CollectiveHLS}: a Collaborative Approach to High-Level Synthesis Design Optimization", journal = j-TRETS, volume = "18", number = "1", pages = "11:1--11:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702005", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3702005", abstract = "High-Level Synthesis (HLS) has played a pivotal role in making FPGAs accessible to a broader audience by facilitating high-level device programming and rapid microarchitecture customization through the use of directives. However, manually selecting the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "11", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fiege:2025:FCW, author = "Nicolai Fiege and Peter Zipf", title = "Fantastic Circuits and Where to Find Them --- a Holistic {ILP} Formulation for Model-Based Hardware Design", journal = j-TRETS, volume = "18", number = "1", pages = "12:1--12:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705325", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3705325", abstract = "The end of Moore's law and Dennard scaling emphasizes the need for application-specific computing architectures to achieve high resource and energy efficiency and real-time performance. The concept of a silicon compiler remains an enduring aspiration for design time reduction. In order to generate hardware implementations at register transfer level from behavioral descriptions, design automation tools must address challenging and interdependent problems, including allocation, scheduling, and binding. Additionally, manual intervention by the user is necessary to balance the resources vs. performance tradeoff via, for example, function inlining or loop unrolling/pipelining. Existing approaches typically solve these problems sequentially, compromising optimality in favor of simplicity and runtime. Here we show how to model the whole model-based design flow as one holistic integer linear programming (ILP) formulation aiming at consistently deriving the optimal microarchitecture for any given application. Incorporating clock gating minimizes the number of useless operations with negligible resource overhead (if any), while always guaranteeing optimal throughput. 
The unified nature of the proposed ILP model enables implementations unmatched by state-of-the-art approaches in terms of resource efficiency and measured power consumption. These results facilitate a streamlined design flow for highly optimized embedded systems in the context of model-based design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "12", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Leggett:2025:AMD, author = "Jordan Leggett and John McGlone and Suleyman Demirsoy and Christian Faerber and Vadim Pelyushenko", title = "Accelerating In-memory Database Functionality with {FPGAs}", journal = j-TRETS, volume = "18", number = "1", pages = "13:1--13:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706113", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3706113", abstract = "In this article, we present a hardware offload of part of the delta merge process used in In-Memory Databases (IMDBs). The delta merge process is fundamental in maintaining high transactional throughput for IMDBs. Improving the efficiency of the delta \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "13", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wu:2025:TPM, author = "Jiajun Wu and Mo Song and Jingmin Zhao and Yizhao Gao and Jia Li and Hayden Kwok-Hay So", title = "{TATAA}: Programmable Mixed-Precision Transformer Acceleration with a Transformable Arithmetic Architecture", journal = j-TRETS, volume = "18", number = "1", pages = "14:1--14:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3714416", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3714416", abstract = "Modern transformer-based deep neural networks present unique technical challenges for effective acceleration in real-world applications. Apart from the vast amount of linear operations needed due to their sizes, modern transformer models are increasingly reliance on precise non-linear computations that make traditional low-bitwidth quantization methods and fixed-dataflow matrix accelerators ineffective for end-to-end acceleration. To address this need to accelerate both linear and non-linear operations in a unified and programmable framework, this article introduces TATAA. TATAA employs 8-bit integer (int8) arithmetic for quantized linear layer operations through post-training quantization, while it relies on bfloat16 floating-point arithmetic to approximate non-linear layers of a transformer model. TATAA hardware features a transformable arithmetic architecture that supports both formats during runtime with minimal overhead, enabling it to switch between a systolic array mode for int8 matrix multiplications and a SIMD mode for vectorized bfloat16 operations. 
An end-to-end compiler is presented to enable flexible mapping from emerging transformer models to the proposed hardware. Experimental results indicate that our mixed-precision design incurs only 0.14\% to 1.16\% accuracy drop when compared with the pre-trained single-precision transformer models across a range of vision, language, and generative text applications. Our prototype implementation on the Alveo U280 FPGA currently achieves 2,935.2 GOPS throughput on linear layers and a maximum of 189.5 GFLOPS for non-linear operations, outperforming related works by up to in end-to-end throughput and $ 2.29 \times $ in DSP efficiency, while achieving $ 2.19 \times $ higher power efficiency than modern NVIDIA RTX4090 GPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "14", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Encinas:2025:LIM, author = "Juan Encinas and Alfonso Rodr{\'\i}guez and Andr{\'e}s Otero", title = "Leveraging Incremental Machine Learning for Reconfigurable Systems Modeling under Dynamic Workloads", journal = j-TRETS, volume = "18", number = "1", pages = "15:1--15:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715154", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 25 09:48:37 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3715154", abstract = "Dynamic workload orchestration is one of the main concerns when working with heterogeneous computing infrastructures in the edge-cloud continuum. In this context, FPGA-based computing nodes can take advantage of their improved flexibility, performance, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol.
Syst.", articleno = "15", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhou:2025:FBB, author = "Wenjie Zhou and Haoyan Qi and David Boland and Philip H. W. Leong", title = "{FPGA}-based Block Minifloat Training Accelerator for a Time Series Prediction Network", journal = j-TRETS, volume = "18", number = "2", pages = "16:1--16:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3707209", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Time series forecasting is the problem of predicting future data samples from historical information and recent deep neural network (DNNs) based techniques have achieved excellent results compared with conventional statistical approaches. Many \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "16", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{She:2025:SLP, author = "Yuhan She and Jierui Liu and Yanlong Huang and Ray C. C. Cheung and Hong Yan", title = "A Speculative Loop Pipeline Framework with Accurate Path Modeling for High-Level Synthesis", journal = j-TRETS, volume = "18", number = "2", pages = "17:1--17:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705732", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Loop pipelining is a key optimization in high-level synthesis (HLS), aimed at overlapping the execution of iterations. 
Static scheduling, dominant in commercial HLS tools, configures the pipeline based on compile-time analysis, proving conservative for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "17", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Brignone:2025:SAS, author = "Giovanni Brignone and Roberto Bosio and Fabrizio Ottati and Claudio Sanso{\`e} and Luciano Lavagno", title = "{SILVIA}: Automated Superword-Level Parallelism Exploitation via {HLS}-specific {LLVM} Passes for Compute-Intensive {FPGA} Accelerators", journal = j-TRETS, volume = "18", number = "2", pages = "18:1--18:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705324", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "High-level synthesis (HLS) aims at democratizing custom hardware acceleration with highly abstracted software-like descriptions. However, efficient accelerators still require substantial low-level hardware optimizations, defeating the HLS intent. In the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "18", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Bao:2025:CCH, author = "Tianyou Bao and Pengzhou He and Daisuke Fujimoto and Yuichi Hayashi and Jiafeng Xie", title = "{CHIRP}: Compact and High-Performance {FPGA} Implementation of Unified Hardware Accelerators for Ring-Binary-{LWE}-based {PQC}", journal = j-TRETS, volume = "18", number = "2", pages = "19:1--19:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715153", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Post-quantum cryptography (PQC) has drawn significant attention from the hardware design research community, especially on field-programmable gate array (FPGA) platforms. In line with this trend, in this article, we present a novel FPGA-based PQC design \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "19", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tokuda:2025:DBF, author = "Daichi Tokuda and Shinya Takamaeda-Yamazaki", title = "{DF-BETA}: an {FPGA}-based Memory Locality Aware Decision Forest Accelerator via Bit-Level Early Termination", journal = j-TRETS, volume = "18", number = "2", pages = "20:1--20:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706114", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Decision forests, particularly Gradient Boosting Decision Trees (GBDT), are popular due to their high prediction performance and computational efficiency, making them suitable for embedded systems with circuit size and available energy constraints. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "20", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Singh:2025:CNN, author = "Gaurav Singh and Kia Bazargan", title = "Compressing Neural Networks using Learnable {1D} Non-Linear Functions", journal = j-TRETS, volume = "18", number = "2", pages = "21:1--21:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705926", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "As deep learning models grow in size to achieve state-of-the-art accuracy, there is a pressing need for compact models. To address this challenge, we introduce a novel operation called Personal Self-Attention (PSA). 
It is specifically designed to learn \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "21", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sun:2025:FBL, author = "Mingqian Sun and Guangwei Xie and Fan Zhang and Wei Guo and Xitian Fan and Li Chen and Jiayu Du", title = "{FPGA}-Based Large-Scale Sorting with Optimized Bandwidth Utilization", journal = j-TRETS, volume = "18", number = "2", pages = "22:1--22:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716392", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Fast sorting of large-scale data is an essential task for data centers. In previous works, the existing computational model of sorting kernel still results in lower bandwidth utilization on the external memory bus. And the execution of merge operations \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "22", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ahmed:2025:MTC, author = "Muhammed Kawser Ahmed and Maximillian Panoff Kealoha and Joel Mandebi Mbongue and Sujan Kumar Saha and Erman Nghonda Tchinda and Peter Esenju Mbua and Christophe Bobda", title = "Multi-Tenant Cloud {FPGA}: a Survey on Security, Trust, and Privacy", journal = j-TRETS, volume = "18", number = "2", pages = "23:1--23:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3713078", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "With the growing demand for enhanced performance and scalability in cloud applications and systems, data center architectures are evolving to incorporate heterogeneous computing fabrics that leverage CPUs, GPUs, and FPGAs. Unlike traditional processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "23", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ploumidis:2025:EPE, author = "Manolis Ploumidis and Fabien Chaix and Nikolaos Chrysos and Marios Assiminakis and Nikolaos Kallimanis and Nikolaos Kossifidis and Michael Nikoloudakis and Nikolaos Dimou and Michalis Gianioudis and George Ieronymakis and Aggelos Ioannou and George Kalokerinos and Pantelis Xirouchakis and Astrinos Damianakis and Michael Ligerakis and Theocharis Vavouris and Manolis Katevenis and Vassilis Papaefstathiou and Manolis Marazakis and Iakovos Mavroidis", title = "The {ExaNeSt} Prototype: Evaluation of Efficient {HPC} Communication Hardware in an {ARM}-based Multi-{FPGA} Rack", journal = j-TRETS, volume = "18", number = "2", pages = "24:1--24:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715152", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "We present and evaluate the ExaNeSt prototype, which compactly packages 128 Xilinx ZU9EG MPSoCs, two TBytes of DRAM, and eight TBytes of SSD into a liquid-cooled rack, using a custom interconnection hardware based on 10 GB/s links. We developed this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "24", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Sugiura:2025:FAC, author = "Keisuke Sugiura and Hiroki Matsutani", title = "{FPGA}-accelerated Correspondence-free Point Cloud Registration with {PointNet} Features", journal = j-TRETS, volume = "18", number = "2", pages = "25:1--25:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3717836", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Point cloud registration serves as a basis for vision and robotic applications including 3D reconstruction and mapping. Despite significant improvements on the quality of results, recent deep learning approaches are computationally expensive and power-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "25", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tavva:2025:CCB, author = "Yaswanth Tavva and Rohan Juneja and Trevor E. Carlson and Li-Shiuan Peh", title = "{CTScan}: a {CGRA}-based Platform for the Emulation of Power Side-Channel Attacks on Edge {CPUs}", journal = j-TRETS, volume = "18", number = "2", pages = "26:1--26:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721294", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Cryptographic algorithms can be exploited by power side-channel attacks. Thus, it is imperative to perform a thorough pre-silicon security evaluation to minimize these potential threats. 
Conventional methods using FPGAs and CAD tools for pre-silicon \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "26", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hao:2025:PGH, author = "Xiaochen Hao and Mingzhe Zhang and Ce Sun and Zhuofu Tao and Hongbo Rong and Yu Zhang and Lei He and Eric Petit and Wenguang Chen and Yun Liang", title = "Productively Generating a High-Performance Linear Algebra Library on {FPGAs}", journal = j-TRETS, volume = "18", number = "2", pages = "27:1--27:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723046", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Linear algebra computations can be greatly accelerated using spatial accelerators on FPGAs. As a standard building block of linear algebra applications, BLAS covers a wide range of compute patterns that vary vastly in data reuse, bottleneck resources, matrix storage layouts, and data types. However, existing implementations of BLAS routines on FPGAs are stuck in the dilemma of productivity and performance. They either require extensive human effort or fail to leverage the properties of routines for acceleration.\par We introduce Lasa, a framework composed of a programming model and a compiler, designed to address the dilemma by abstracting (for productivity) and specializing (for performance) the architecture of a spatial accelerator. The programming model realizes systolic arrays using uniform recurrence equations and space-time transforms. Streaming tensors, an intuitive dataflow-style abstraction, is proposed to uniformly describe the movement, storage, and transpose of input and output data across the spatial components. 
According to streaming tensors, a customized memory hierarchy is automatically built on an FPGA by our compiler. The compiler further specializes the architecture with transparent optimizations on FPGAs. Using this framework, we develop a complete BLAS library, demonstrating performance in parity with expert-written HLS code for BLAS level 3 routines, 76\%--94\% machine peak for level 1 and 2 routines, and 1.6X--13X speedup by leveraging the matrix properties such as symmetry, triangularity, and bandness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "27", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mohtavipour:2025:PPR, author = "Seyed Mehdi Mohtavipour and Hadi Shahriar Shahhoseini", title = "{PRISA}: a Potential Region-based Intelligent Search Algorithm for Dataflow Graph Mapping in Spatial {CGRAs}", journal = j-TRETS, volume = "18", number = "2", pages = "28:1--28:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723045", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Coarse-grained Reconfigurable Architectures (CGRAs) offer energy efficiency and programmability, making them integral to modern high-performance computing. However, complicated compilation when mapping the Dataflow Graph (DFG) to CGRA components leads to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "28", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Feng:2025:HTT, author = "Yongkang Feng and Liang Yao and Hongli Zhou and Minjie Wu and Shuai Xiang and Wanting Sun and Xiumin Xu and Yingchun Lu", title = "High-Throughput {TRNG} Design with Novelty Adjustable {TDC} Based on {STR}", journal = j-TRETS, volume = "18", number = "2", pages = "29:1--29:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722118", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In IoT devices, True Random Number Generators (TRNGs) play an increasingly important role, and advanced TRNGs must possess high throughput, low resource overhead, and high stability. In this article, we propose a fine-grained entropy extraction circuit based on Self-Timed Ring (STR), which can change the entropy extraction capability by varying the stages of STRs to extract randomness from different entropy sources. Importantly, the throughput of the proposed TRNG can be automatically adjusted according to the frequency of the entropy source, adapting to user requirements. The proposed TRNG is validated on Xilinx Spartan-6, Xilinx Artix-7, and Xilinx Virtex-6 FPGA development boards. It utilizes a three-stage Ring Oscillator (RO) and a five-stage RO for entropy extraction, requiring only 53 LUTs, 32 DFFs, and 62 registers. The generated random numbers of the TRNG, without any post-processing, achieve excellent results in NIST SP 800-22, NIST SP 800-90B, robustness test, universality test, AIS-31, and TEST U01, demonstrating a throughput of 280 Mbps.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "29", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zuo:2025:ALD, author = "Cheng Zuo and Chang Wu", title = "Algorithmic-Level Design Partitioning for Latency Minimization in Multi-Chip and Multi-Die Systems", journal = j-TRETS, volume = "18", number = "2", pages = "30:1--30:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727646", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Jun 14 15:28:47 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "In the post-Moore's Law era, multi-chip and chiplet designs have become important trends. However, due to the limited inter-chip routing resources and large delays, partition a design into such multi-chips may lead to performance degradation. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "30", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Betz:2025:EMN, author = "Vaughn Betz", title = "Editorial: a Message from the New {Editor-in-Chief}", journal = j-TRETS, volume = "18", number = "3", pages = "31e:1--31e:2", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3765289", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "31e", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fang:2025:ISI, author = "Zhenman Fang", title = "Introduction to the Special Issue on {RAW 2024}", journal = j-TRETS, volume = "18", number = "3", pages = "31:1--31:2", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3742478", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "31", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Venere:2025:RQQ, author = "Marco Venere and Beatrice Branchini and Davide Conficconi and Donatella Sciuto and Marco D. Santambrogio", title = "Rock the {QASBA}: Quantum Error Correction Acceleration via the {Sparse Blossom Algorithm} on {FPGAs}", journal = j-TRETS, volume = "18", number = "3", pages = "32:1--32:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723168", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "32", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Valentino:2025:QFU, author = "Federico Valentino and Beatrice Branchini and Davide Conficconi and Donatella Sciuto and Marco D. 
Santambrogio", title = "{QUEKUF}: an {FPGA} Union Find Decoder for Quantum Error Correction on the Toric Code", journal = j-TRETS, volume = "18", number = "3", pages = "33:1--33:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3733239", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "33", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Peverelli:2025:DFB, author = "Francesco Peverelli and Daniele Paletti and Davide Conficconi", title = "{DFlows}: a Flow-Based Programming Approach for a Polyglot Design-Space Exploration Framework", journal = j-TRETS, volume = "18", number = "3", pages = "34:1--34:32", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3717837", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "34", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhu:2025:DFD, author = "Guowei Zhu and Liming Deng and Kaisen Zhang and Wang Fan and Boyin Jin and Wei Cao and Fengzhe Zhang and Xuegong Zhou and Fan Zhang and Xinsheng Yu", title = "{DVHetero}: a Framework for Designing and Validating Heterogeneous {SoC} with {RISC-V} Processor and {CGRA}", journal = j-TRETS, volume = "18", number = "3", pages = "35:1--35:30", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3733721", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "35", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Yang:2025:RRL, author = "Moucheng Yang and Chengyu Zeng and Kaixiang Zhu and Lingli Wang", title = "{RLUT}: a Reduced {LUT} Architecture with Fine-Grained Scalability and Its Automatic Design Flow for Large Frequent Functions", journal = j-TRETS, volume = "18", number = "3", pages = "36:1--36:32", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3737291", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "36", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lu:2025:MFB, author = "Shaoqiang Lu and Tiandong Zhao and Ting-Jung Lin and Rumin Zhang and Chen Wu and Lei He", title = "{MCoreOPU}: an {FPGA}-based Multi-Core Overlay Processor for Transformer-based Models", journal = j-TRETS, volume = "18", number = "3", pages = "37:1--37:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3742437", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "37", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Xu:2025:SND, author = "Jinwei Xu and Jingfei Jiang and Lei Gao and Xifu Qian and Yong Dou", title = "{SPDFA}: a Novel Dataflow Fusion Sparse Deep Neural Network Accelerator", journal = j-TRETS, volume = "18", number = "3", pages = "38:1--38:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3737462", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "38", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Elgammal:2025:VOS, author = "Mohamed A. Elgammal and Amin Mohaghegh and Soheil Gholami Shahrouz and Fatemehsadat Mahmoudi and Fahrican Kosar and Kimia Talaei and Joshua Fife and Daniel Khadivi and Kevin Murray and Andrew Boutros and Kenneth B. 
Kent and Jeff Goeders and Vaughn Betz", title = "{VTR 9}: Open-Source {CAD} for Fabric and Beyond {FPGA} Architecture Exploration", journal = j-TRETS, volume = "18", number = "3", pages = "39:1--39:53", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734798", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", note = "See corrigendum \cite{Elgammal:2025:CVO}.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "39", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Fan:2025:DUD, author = "Zimeng Fan and Min Peng", title = "{DGMF}: a Unified Dynamic Mapping Framework for Graph Neural Networks", journal = j-TRETS, volume = "18", number = "3", pages = "40:1--40:30", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744345", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "40", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Roks:2025:APA, author = "Geert Roks and Mario Ruiz Noguera and Nikolaos Alachiotis", title = "Accelerated Phylogenetics on the {AMD} Versal Adaptive {SoC}", journal = j-TRETS, volume = "18", number = "3", pages = "41:1--41:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747592", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "41", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Ibrahim:2025:VVF, author = "Mustafa Ibrahim and Sebastien Pillement and Andrea Pinna and Sebastien {Le Nours}", title = "{VERSATILE}: Very Fast Partial Reconfiguration Controller", journal = j-TRETS, volume = "18", number = "3", pages = "42:1--42:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748728", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:10 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "42", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lapotre:2025:ECH, author = "Vianney Lap{\^o}tre and Cyrille Chavet and Ghita Harcha and Philippe Coussy", title = "Exploring the Contribution of Hardware Shuffling in Securing Low-Cost Symmetric Encryption Devices against Power-Based Side-Channel Attacks: Case Study of an {AES-128} on {FPGA}", journal = j-TRETS, volume = "18", number = "4", pages = "43:1--43:18", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3758100", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "43", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Gottschaldt:2025:THL, author = "Paul Gottschaldt and Ariel Podlubne and Diana G{\"o}hringer", title = "A Taxonomy of the High-Level Synthesis Ecosystem for Heterogeneous {FPGA} Systems", journal = j-TRETS, volume = "18", number = "4", pages = "44:1--44:33", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764664", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "44", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Debnath:2025:GER, author = "Mukta Debnath and Animesh Basak Chowdhury and Debasri Saha and Susmita Sur-Kolay", title = "{GreyConE+}: Efficient Rare-Target Test Generation for {FPGA HLS} Designs", journal = j-TRETS, volume = "18", number = "4", pages = "45:1--45:29", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769295", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "45", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Kabir:2025:VDL, author = "MD Arafat Kabir and Nathaniel Fredricks and Tendayi Kamucheka and Joel Mandebi and Miaoqing Huang and Jason D. Bakos and David Andrews", title = "{DA-VinCi}: a Deep-Learning Accelerator Overlay Using In-Memory Computing", journal = j-TRETS, volume = "18", number = "4", pages = "46:1--46:38", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770756", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "46", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Rajashekar:2025:MHM, author = "Manoj Bheemasandra Rajashekar and Akhil Raj Baranwal and Xingyu Tian and Zhenman Fang", title = "{MAD-HiSpMV}: Matrix Adaptive Design with Hybrid Row Distribution for Imbalanced {SpMV} Acceleration on {FPGAs}", journal = j-TRETS, volume = "18", number = "4", pages = "47:1--47:31", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3772082", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "47", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Shu:2025:LLC, author = "Mingyu Shu and Qiang Liu", title = "{LHAM}: Low-Cost and High-Accuracy Approximate Multiplier for {FPGA}-Based Computing", journal = j-TRETS, volume = "18", number = "4", pages = "48:1--48:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770757", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "48", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Deng:2025:EER, author = "Qi Deng and Hao Sun and Yuhao Shu and Jianzhong Xiao and Weixiong Jiang and Hui Wang and Yajun Ha", title = "An Energy-Efficient and Real-Time {FPGA-Based} Point Cloud Registration Framework with Ultra-Fast and Configurable Multi-Mode Correspondence Search", journal = j-TRETS, volume = "18", number = "4", pages = "49:1--49:30", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771768", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "49", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Lu:2025:IIO, author = "Tianshuo Lu and Jianyang Ding and Bowen Jiang and Huachen Zhang and Wei Xu and Zhilei Chai", title = "{ISRLUT}: Integer-Only {FHD} Image Super-Resolution Based on Neural Lookup Table and Near-Memory Computing", journal = j-TRETS, volume = "18", number = "4", pages = "50:1--50:31", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770759", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "50", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Deng:2025:AAD, author = "Liming Deng and Guowei Zhu and Xitian Fan and Wei Cao and Xuegong Zhou and Fan Zhang and Shaobo Yang", title = "{AHCA}: Agile Design Framework for Hashcat Acceleration Based on {FPGA}", journal = j-TRETS, volume = "18", number = "4", pages = "51:1--51:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770760", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "51", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Tu:2025:EEF, author = "Yazheng Tu and Jiafeng Xie", title = "{EMINEM}: Efficient {FPGA} Implementation of Mixed-{RadIx} {NTT} Hardware {AccElerators} for {NIST} Post-{QuantuM} Cryptography {Falcon}, {Dilithium}, and {HAWK}", journal = j-TRETS, volume = "18", number = "4", pages = "52:1--52:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771287", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "52", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Baranwal:2025:PET, author = "Akhil Raj Baranwal and Zhenman Fang", title = "{PoCo}: Extending Task-Parallel {HLS} Programming with Shared Multi-Producer Multi-Consumer Buffer Support", journal = j-TRETS, volume = "18", number = "4", pages = "53:1--53:33", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771938", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "53", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Hu:2025:HHP, author = "Qilin Hu and Haotian Wang and Chubo Liu and Keqin Li and Kenli Li", title = "{HiFA}: a High-Performance and Flexible Acceleration Framework for Large-Size Number Theoretic Transform", journal = j-TRETS, volume = "18", number = "4", pages = "54:1--54:32", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771769", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "54", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Abdurakhmanov:2025:EMM, author = "Abdurauf Abdurakhmanov and Suhaib A. 
Fahmy", title = "Exploring Microscaling {MX} Minifloat Systolic Arrays on {FPGAs}", journal = j-TRETS, volume = "18", number = "4", pages = "55:1--55:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3773041", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "55", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Abbasi:2025:OMH, author = "Ali Abbasi and Danesh Germchi and Amin Katani and Mohamed Hassan and Rodolfo Pellizzoni", title = "{OpenDRAM}: a Modular, High-performance Soft Memory Controller for {DDR4 DRAM}", journal = j-TRETS, volume = "18", number = "4", pages = "56:1--56:27", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3772724", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "56", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Elgammal:2025:CVO, author = "Mohamed A. Elgammal and Amin Mohaghegh and Soheil Gholami Shahrouz and Fatemehsadat Mahmoudi and Fahrican Kosar and Kimia Talaei and Joshua Fife and Daniel Khadivi and Kevin Murray and Andrew Boutros and Kenneth B. 
Kent and Jeff Goeders and Vaughn Betz", title = "Corrigendum: {VTR 9}: Open-Source {CAD} for Fabric and Beyond {FPGA} Architecture Exploration", journal = j-TRETS, volume = "18", number = "4", pages = "C1:1--C1:2", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3778036", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Dec 23 07:21:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", note = "See \cite{Elgammal:2025:VOS}.", abstract = "This is a corrigendum for the article ``VTR 9: Open-Source CAD for Fabric and Beyond FPGA Architecture Exploration'' published in ACM Trans. Reconfig. Technol. Syst. 18, 3, Article 39 (August 2025), 53 pages.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "C1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Xu:2026:DRE, author = "Weihai Xu and Jin Zhang and Zheng Zhou and Hang Yang and Shifeng Huang and Yin Tang and Yiming Jiang and Jiangxing Wu", title = "On-Demand Regular Expression Matching on {FPGAs} for Efficient Deep Packet Inspection", journal = j-TRETS, volume = "19", number = "1", pages = "1:1--1:32", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3774651", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Deep Packet Inspection (DPI) faces significant bottlenecks in regular expression (regex) matching due to escalating rule complexity and traffic volume. Existing FPGA-based solutions inefficiently process all packets through every automaton, incurring \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "1", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Liang:2026:THT, author = "Zhuorong Liang and Siqi Deng and Tao Su", title = "{TernaryGNNs}: a High-Throughput, Area-Efficient Ternary Weight {GNNs} Inference Framework on {CPU--FPGA} Heterogeneous Platform", journal = j-TRETS, volume = "19", number = "1", pages = "2:1--2:31", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3776568", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "Graph Neural Networks (GNNs) have achieved remarkable success in recent years due to their powerful ability to model non-Euclidean data structures and complex relationships. However, as graph sizes and model complexities continue to grow, the efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "2", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Mao:2026:NST, author = "Xiaoyu Mao and Qinming Zhou and Xiao Chen and Tao Su", title = "Novel Security Threats in Multi-Tenant {FPGAs}: Phase Tuning for Voltage Sensors in Remote Power Side-Channel Analysis Attacks on {AES}", journal = j-TRETS, volume = "19", number = "1", pages = "3:1--3:28", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3779437", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "The rapid advancement of cloud computing has facilitated the widespread integration of field-programmable gate arrays (FPGAs) into cloud servers to accelerate computing processes. To reduce costs, cloud service providers aim to promote multi-tenant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "3", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wadood:2026:FHT, author = "Abdul Wadood and Alec Lu and Haikai Zhao and Zhenman Fang", title = "{FORCv2}: a High-Throughput Streaming {FPGA} Accelerator for Optimized Row Columnar File Format Processing in Big Data Engines", journal = j-TRETS, volume = "19", number = "1", pages = "4:1--4:33", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787488", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "To enhance the storage efficiency of large datasets, Big Data analytics commonly rely on columnar file formats, such as Apache ORC (Optimized Row Columnar), to encode and compress data. These formats significantly reduce storage requirements and improve \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", }

%%% In the abstract of the following entry, percent signs are written as
%%% \% because an unescaped percent sign starts a TeX comment when a
%%% bibliography style typesets the abstract.
%%% NOTE(review): the author names look like Spanish two-part family
%%% names (Jimenez Arribas, Martinez Hellin, Prieto Mateo); as written,
%%% BibTeX takes only the final word of each name as the family name.
%%% Confirm against the publisher record before regrouping, because the
%%% citation tag is derived from the current parse.
@Article{Arribas:2026:HPR,
  author =       "Miguel Jim{\'e}nez Arribas and Agust{\'\i}n
                 Mart{\'\i}nez Hell{\'\i}n and Manuel Prieto Mateo",
  title =        "High-Performance {RISC-V} {CSR} Access in {FPGAs}:
                 Optimized Microarchitecture for Efficient Decoding and
                 Multiplexing",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "5:1--5:34",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3787491",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Control and Status Registers (CSRs) are fundamental to
                 the RISC-V architecture, providing a versatile
                 interface for managing processor state, system
                 configuration, and functionality like exception
                 handling, debugging, and performance monitoring.
                 However, their implementation in FPGA-based systems
                 poses challenges due to the inherent constraints of
                 FPGA resources. The need for atomic parallel access to
                 all CSRs, coupled with limitations of conventional
                 LUT-based multiplexing, increases logic depth and
                 resource usage, degrading Fmax in large-scale
                 implementations. This work explores these obstacles and
                 introduces optimized mechanisms to improve CSR handling
                 efficiency. A minimal microarchitectural environment
                 was built to isolate and evaluate multiple access
                 strategies, retaining only components essential to CSR
                 interaction. Leveraging a heterogeneous design tailored
                 to FPGA capabilities --- featuring BRAM for decoding,
                 DSPs for multiplexing, and LUTs, CARRYs and flip-flops
                 to facilitate routing --- the proposed CSR subsystem
                 achieves performance enhancements ranging from 50\% to
                 over 300\%, contingent upon configuration.
The top-performing implementation reaches 250 MHz on Artix-7 FPGAs while simultaneously reducing area and dynamic power. These results challenge prevailing reliance on abstract electronic design, highlighting that hardware-aware low-level methodologies can yield superior quality-of-results (QoR) over pure behavioral descriptions. More broadly, the findings inform datapath optimization involving decoding and multiplexing in performance-critical and resource-constrained digital architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "5", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wang:2026:ZOT, author = "Wenjie Wang and Bo Peng and Jianguo Yao and Haibing Guan", title = "{Zero2M}: Optimizing Tenant-Level {I/O} Management for Future Faster {NVMe} Storage with {FPGA}", journal = j-TRETS, volume = "19", number = "1", pages = "6:1--6:34", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787489", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "High-speed Non-Volatile Memory Express (NVMe) Solid-State Drives (SSDs) are shared by multiple tenants in cloud scenarios to improve resource utilization. Tenant-level I/O management is necessary to achieve reliable QoS control during sharing. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "6", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Zhang:2026:MTM, author = "Yuan Zhang and Jiliang Zhang", title = "{MCT-TRNG}: Multi-Channel Tetrahedral {TRNG} via Metastability-Enhanced Entropy with {2.2 Gbps} Throughput", journal = j-TRETS, volume = "19", number = "1", pages = "7:1--7:21", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3779443", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Tue Mar 17 15:14:49 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", abstract = "True random number generators (TRNGs) extract randomness from physical phenomena to produce inherently unpredictable bitstreams. Owing to their strong cryptographic properties, TRNGs are fundamental components for establishing trusted roots in secure systems. However, the throughput of current TRNGs falls short of meeting the increasing demands posed by high-speed encryption and rapidly growing data volumes. To solve this issue, in this article, we propose a multi-channel tetrahedral TRNG (MCT-TRNG) via metastability-enhanced entropy with ultra-high throughput. We first propose a novel entropy source structure of a metastability-enhanced ring oscillator, which extracts randomness in unstable signals by switching transmission paths. Then, we introduce a feedback XOR ring to improve the degree of signal chaos. On this basis, we propose a tetrahedral post-processing structure with four channels to produce independent parallel outputs. The experiments show that the generated random sequences have successfully passed the NIST and AIS-31 tests. The MCT-TRNG incurs only 18 LUTs with a throughput of 2.2 Gbps on Xilinx Artix-7 FPGA. 
Compared with existing works, our design has the highest throughput, yielding promising application potential.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. Syst.", articleno = "7", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", }

@Article{Ma:2026:CLH,
  author =       "Wenheng Ma and Xinhao Yang and Shulin Zeng and Tengxuan Liu and Libo Shen and Hongyi Wang and Shiyao Li and Ke Hong and Zhenhua Zhu and Xuefei Ning and Tsung-Yi Ho and Guohao Dai and Yu Wang",
  title =        "{CD-LLM}: a Heterogeneous Multi-{FPGA} System for Batched Decoding of {70B+} {LLMs} Using a Compute-Dedicated Architecture",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "8:1--8:25",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3771288",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Large Language Models (LLMs) with 70 billion or more parameters are increasingly being deployed in cloud-based Model-as-a-Service (MaaS) scenarios. To meet the demands of such deployments, MaaS providers require batched LLM decoding systems that can \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sunketa:2026:ONA,
  author =       "Ruthwik Reddy Sunketa and Muhammad Ali Farooq and Ganesh Gore and Allen Boston and Pierre-Emmanuel Gaillardon and Aman Arora",
  title =        "{OpenFPGA-NoC}: Automated Fabric and Bitstream Generation for {NoC}-based {FPGAs}",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "9:1--9:32",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3779449",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "As the demand for high-performance and flexible hardware accelerators increases, Network-on-Chip (NoC)-based Field Programmable Gate Arrays (FPGAs) offer a scalable solution for complex, data-intensive applications. While commercial FPGA vendors like \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Baroughi:2026:HHP,
  author =       "Ahmad Sedigh Baroughi and Manoj Bheemasandra Rajashekar and Akhil Raj Baranwal and Zhenman Fang",
  title =        "{HiSpMM}: High Performance High Bandwidth Sparse-Dense Matrix Multiplication on {HBM}-equipped {FPGAs}",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "10:1--10:25",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3774327",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Sparse Matrix-Dense Matrix Multiplication (SpMM) is a critical operation in scientific computing, machine learning, and graph analytics. However, accelerating SpMM on FPGAs presents major challenges due to irregular memory access patterns and imbalanced \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Teng:2026:TNF,
  author =       "Wenbin Teng and Wenqi Lou and Teng Wang and Lei Gong and Chao Wang and Xuehai Zhou",
  title =        "{TETRIS}: a Novel {FPGA} Virtualization Framework for Fine-grained Sharing via Hierarchical Reconfiguration",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "11:1--11:32",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3779447",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Field-Programmable Gate Arrays (FPGAs) are increasingly used in cloud platforms to accelerate diverse workloads, thanks to their reconfigurability and high performance. However, in multi-tenant cloud environments, existing FPGA virtualization mechanisms \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2026:REV,
  author =       "Wenbo Zhang and Yan Zhang and Yiqi Liu and Lingjie Wu and Xingtong Hu",
  title =        "{REATA}: an Efficient Vision Transformer Accelerator Featuring a Resource-Optimized Attention Design on Versal {ACAP}",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "12:1--12:32",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3779444",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Deploying Vision Transformers (ViTs) on edge devices poses significant challenges due to their high computational demands and memory access overheads, which severely hinder real-time inference efficiency. This article proposes a modular and adaptive ViT \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sun:2026:DDA,
  author =       "Chang Sun and Zhiqiang Que and Vladimir Loncar and Wayne Luk and Maria Spiropulu",
  title =        "{\tt da4ml}: Distributed Arithmetic for Real-time Neural Networks on {FPGAs}",
  journal =      j-TRETS,
  volume =       "19",
  number =       "1",
  pages =        "13:1--13:27",
  month =        mar,
  year =         "2026",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3777387",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 17 15:14:49 MDT 2026",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Neural networks with a latency requirement on the order of microseconds, like the ones used at the CERN Large Hadron Collider, are typically deployed on FPGAs fully unrolled and pipelined. A bottleneck for the deployment of such neural networks is area utilization, which is directly related to the required constant matrix--vector multiplication (CMVM) operations. In this work, we propose an efficient algorithm for implementing CMVM operations with distributed arithmetic on FPGAs that simultaneously optimizes for area consumption and latency. The algorithm achieves resource reduction similar to state-of-the-art algorithms while being significantly faster to compute. The proposed algorithm is open sourced and integrated into the hls4ml library, a free and open source library for running real-time neural network inference on FPGAs. We show that the proposed algorithm can reduce on-chip resources by up to a third for realistic, highly quantized neural networks while simultaneously reducing latency, enabling the implementation of previously infeasible networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}