%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.83",
%%%     date            = "02 February 2026",
%%%     time            = "08:39:20 MDT",
%%%     filename        = "tomccap.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "60048 69100 339097 3285251",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography; BibTeX; ACM Transactions on
%%%                        Multimedia Computing, Communications, and
%%%                        Applications; TOMCCAP; TOMM",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Multimedia Computing,
%%%                        Communications, and Applications (CODEN
%%%                        ????, ISSN 1551-6857), completely covering
%%%                        all issues from volume 1, number 1, February
%%%                        2005 to date.
%%%
%%%                        NB: On 23-May-2014, the journal acronym was
%%%                        changed by ACM from TOMCCAP to TOMM, but the
%%%                        full journal name remains unchanged, and
%%%                        volume / number / pages values are not
%%%                        affected by the change.  The BibTeX journal
%%%                        abbreviation has therefore changed at volume
%%%                        10, number 4, June 2014, from j-TOMCCAP to
%%%                        j-TOMM.  The filename remains tomccap.bib; no
%%%                        tomm.bib exists at the master archive site.
%%%
%%%                        The ACM maintains World Wide Web pages with
%%%                        journal tables of contents for 2005--date at
%%%
%%%                            http://www.acm.org/tomccap/
%%%                            http://www.acm.org/pubs/contents/journals/tomccap/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J961
%%%
%%%                        That data has been automatically converted to
%%%                        BibTeX form, corrected for spelling and page
%%%                        number errors, and merged into this file.
%%% %%% At version 1.83, the COMPLETE year coverage %%% looks like this: %%% %%% 2005 ( 20) 2013 ( 62) 2021 ( 168) %%% 2006 ( 18) 2014 ( 55) 2022 ( 153) %%% 2007 ( 27) 2015 ( 51) 2023 ( 209) %%% 2008 ( 45) 2016 ( 31) 2024 ( 302) %%% 2009 ( 14) 2017 ( 63) 2025 ( 308) %%% 2010 ( 31) 2018 ( 75) 2026 ( 30) %%% 2011 ( 41) 2019 ( 62) %%% 2012 ( 56) 2020 ( 122) %%% %%% Article: 1943 %%% %%% Total entries: 1943 %%% %%% Spelling has been verified with the UNIX %%% spell and GNU ispell programs using the %%% exception dictionary stored in the companion %%% file with extension .sok. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. %%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed by the %%% author for the BibNet Project. %%% %%% In this bibliography, entries are sorted %%% by journal, and then by publication order, %%% with the help of ``bibsort -byvolume''. The %%% bibsort utility is available from %%% ftp://ftp.math.utah.edu/pub/tex/bib. %%% %%% The author will be grateful for reports of %%% errors of any kind in this bibliography. 
%%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ==================================================================== @Preamble{"\input bibnames.sty" # "\ifx \Thorn \undefined \def \Thorn {T}\fi" # "\hyphenation{ }" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% From the ACM Portal Web site: ``On 23rd May 2014, ACM TOMCCAP %%% changed its acronym to ACM TOMM. This acronym change was the result %%% of extensive discussions between the journal Editorial Board and %%% SIGMM constituents dating back to 2011. This name change emphasizes %%% the continued strong collaboration with the ACM Multimedia %%% conference (ACMMM).'' %%% %%% Journal abbreviations: @String{j-TOMCCAP = "ACM Transactions on Multimedia Computing, Communications, and Applications"} @String{j-TOMM = "ACM Transactions on Multimedia Computing, Communications, and Applications"} %%% ==================================================================== %%% Bibliography entries sorted in publication order: @Article{Georganas:2005:EBA, author = "Nicolas D. 
Georganas", title = "{Editorial}: {The} birth of the {ACM Transactions on Multimedia Computing, Communications and Applications} {(TOMCCAP)}", journal = j-TOMCCAP, volume = "1", number = "1", pages = "1--2", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rowe:2005:ASR, author = "Lawrence A. Rowe and Ramesh Jain", title = "{ACM SIGMM Retreat} report on future directions in multimedia research", journal = j-TOMCCAP, volume = "1", number = "1", pages = "3--13", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jain:2005:GEI, author = "Ramesh Jain and Thomas Plagemann and Ralf Steinmetz", title = "Guest editorial: {The International ACM Multimedia Conference 1993} --- ten years after", journal = j-TOMCCAP, volume = "1", number = "1", pages = "14--15", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Teodosio:2005:SS, author = "Laura Teodosio and Walter Bender", title = "Salient stills", journal = j-TOMCCAP, volume = "1", number = "1", pages = "16--36", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Reddy:2005:DSM, author = "A. L. N. Reddy and Jim Wyllie and K. B. R. Wijayaratne", title = "Disk scheduling in a multimedia {I/O} system", journal = j-TOMCCAP, volume = "1", number = "1", pages = "37--59", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Buchanan:2005:ATL, author = "M. Cecelia Buchanan and Polle T. 
Zellweger", title = "Automatic temporal layout mechanisms revisited", journal = j-TOMCCAP, volume = "1", number = "1", pages = "60--88", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bulterman:2005:SMA, author = "Dick C. A. Bulterman and Lynda Hardman", title = "Structured multimedia authoring", journal = j-TOMCCAP, volume = "1", number = "1", pages = "89--109", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mayer-Patel:2005:BSM, author = "Ketan Mayer-Patel and Brian C. Smith and Lawrence A. Rowe", title = "The {Berkeley} software {MPEG-1} video decoder", journal = j-TOMCCAP, volume = "1", number = "1", pages = "110--125", month = feb, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Apr 14 11:01:03 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Plagemann:2005:SPA, author = "Thomas Plagemann and Prashant Shenoy and John R. Smith", title = "Selected papers from the {ACM Multimedia Conference 2003}", journal = j-TOMCCAP, volume = "1", number = "2", pages = "127--127", month = may, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Jul 7 13:52:13 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kum:2005:RTM, author = "Sang-Uok Kum and Ketan Mayer-Patel", title = "Real-time multidepth stream compression", journal = j-TOMCCAP, volume = "1", number = "2", pages = "128--150", month = may, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Jul 7 13:52:13 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Feng:2005:PSL, author = "Wu-Chi Feng and Ed Kaiser and Wu Chang Feng and Mikael Le Baillif", title = "{Panoptes}: scalable low-power video sensor networking technologies", journal = j-TOMCCAP, volume = "1", number = "2", pages = "151--167", month = may, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Jul 7 13:52:13 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Goh:2005:SFD, author = "Kingshy Goh and Beitao Li and Edward Y. Chang", title = "Semantics and feature discovery via confidence-based ensemble", journal = j-TOMCCAP, volume = "1", number = "2", pages = "168--189", month = may, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Jul 7 13:52:13 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Baker:2005:UPC, author = "H. Harlyn Baker and Nina Bhatti and Donald Tanguay and Irwin Sobel and Dan Gelb and Michael E. Goss and W. 
Bruce Culbertson and Thomas Malzbender", title = "Understanding performance in {Coliseum}, an immersive videoconferencing system", journal = j-TOMCCAP, volume = "1", number = "2", pages = "190--210", month = may, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Jul 7 13:52:13 MDT 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Adams:2005:IIM, author = "Brett Adams and Svetha Venkatesh and Ramesh Jain", title = "{IMCE}: {Integrated} media creation environment", journal = j-TOMCCAP, volume = "1", number = "3", pages = "211--247", month = aug, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Fri Nov 18 08:30:19 MST 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Poellabauer:2005:FCD, author = "Christian Poellabauer and Karsten Schwan", title = "Flexible cross-domain event delivery for quality-managed multimedia applications", journal = j-TOMCCAP, volume = "1", number = "3", pages = "248--268", month = aug, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Fri Nov 18 08:30:19 MST 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cooper:2005:TEC, author = "Matthew Cooper and Jonathan Foote and Andreas Girgensohn and Lynn Wilcox", title = "Temporal event clustering for digital photo collections", journal = j-TOMCCAP, volume = "1", number = "3", pages = "269--288", month = aug, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Fri Nov 18 08:30:19 MST 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2005:CEM, author = "Keqiu Li and Hong Shen", title = "Coordinated enroute multimedia object caching in transcoding proxies for tree networks", journal = j-TOMCCAP, volume = "1", number = "3", pages = "289--314", month = aug, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Fri Nov 18 08:30:19 MST 2005", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2005:AFE, author = "Huahui Wu and Mark Claypool and Robert Kinicki", title = "Adjusting forward error correction with temporal scaling for {TCP}-friendly streaming {MPEG}", journal = j-TOMCCAP, volume = "1", number = "4", pages = "315--337", month = nov, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cai:2005:LUL, author = "Jianfei Cai and Xiangjun Li and Chang Wen Chen", title = "Layered unequal loss protection with pre-interleaving for fast progressive image transmission over packet-loss channels", journal = j-TOMCCAP, volume = "1", number = "4", pages = "338--353", month = nov, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tu:2005:ASP, author = "Yi-Cheng Tu and Jianzhong Sun and Mohamed Hefeeda and Sunil Prabhakar", title = "An analytical study of peer-to-peer media streaming systems", journal = j-TOMCCAP, volume = "1", number = "4", pages = "354--376", month = nov, year = "2005", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lew:2006:CBM, author = "Michael S. Lew and Nicu Sebe and Chabane Djeraba and Ramesh Jain", title = "Content-based multimedia information retrieval: {State} of the art and challenges", journal = j-TOMCCAP, volume = "2", number = "1", pages = "1--19", month = feb, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{DelBimbo:2006:CBR, author = "Alberto {Del Bimbo} and Pietro Pala", title = "Content-based retrieval of {$3$D} models", journal = j-TOMCCAP, volume = "2", number = "1", pages = "20--43", month = feb, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Xu:2006:FAF, author = "Huaxin Xu and Tat-Seng Chua", title = "Fusion of {AV} features and external information sources for event detection in team sports video", journal = j-TOMCCAP, volume = "2", number = "1", pages = "44--67", month = feb, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Joshi:2006:SPE, author = "Dhiraj Joshi and James Z. 
Wang and Jia Li", title = "The {Story Picturing Engine}---a system for automatic text illustration", journal = j-TOMCCAP, volume = "2", number = "1", pages = "68--89", month = feb, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Snoek:2006:LRS, author = "Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann", title = "Learning rich semantics from news video archives by style analysis", journal = j-TOMCCAP, volume = "2", number = "2", pages = "91--108", month = may, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2006:SER, author = "Guang Yang and Tony Sun and Mario Gerla and M. Y. Sanadidi and Ling-Jyh Chen", title = "Smooth and efficient real-time video transport in the presence of wireless errors", journal = j-TOMCCAP, volume = "2", number = "2", pages = "109--126", month = may, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shao:2006:ASM, author = "Xi Shao and Changsheng Xu and Namunu C. Maddage and Qi Tian and Mohan S. Kankanhalli and Jesse S. Jin", title = "Automatic summarization of music videos", journal = j-TOMCCAP, volume = "2", number = "2", pages = "127--148", month = may, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Eide:2006:RTV, author = "Viktor S. Wold Eide and Ole-Christoffer Granmo and Frank Eliassen and J{\o}rgen Andreas Michaelsen", title = "Real-time video content analysis: {QoS}-aware application composition and parallel processing", journal = j-TOMCCAP, volume = "2", number = "2", pages = "149--172", month = may, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Candan:2006:ISI, author = "K. 
Sel{\c{c}}uk Candan and Augusto Celentano and Wolfgang Klas", title = "Introduction to special issue on the use of context in multimedia information systems", journal = j-TOMCCAP, volume = "2", number = "3", pages = "173--176", month = aug, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ferrara:2006:SWO, author = "Alfio Ferrara and Luca A. Ludovico and Stefano Montanelli and Silvana Castano and Goffredo Haus", title = "A {Semantic Web} ontology for context-based classification and retrieval of music resources", journal = j-TOMCCAP, volume = "2", number = "3", pages = "177--198", month = aug, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Arigon:2006:HMP, author = "Anne-Muriel Arigon and Anne Tchounikine and Maryvonne Miquel", title = "Handling multiple points of view in a multimedia data warehouse", journal = j-TOMCCAP, volume = "2", number = "3", pages = "199--218", month = aug, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kahol:2006:MCH, author = "Kanav Kahol and Priyamvada Tripathi and Troy Mcdaniel and Laura Bratton and Sethuraman Panchanathan", title = "Modeling context in haptic perception, rendering, and visualization", journal = j-TOMCCAP, volume = "2", number = "3", pages = "219--240", month = aug, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Thu Sep 7 16:13:26 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gulliver:2006:DUP, author = "Stephen R. 
Gulliver and Gheorghita Ghinea", title = "Defining user perception of distributed multimedia quality", journal = j-TOMCCAP, volume = "2", number = "4", pages = "241--257", month = nov, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Apr 14 11:19:17 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gopalan:2006:SAC, author = "Kartik Gopalan and Lan Huang and Gang Peng and Tzi-Cker Chiueh and Yow-Jian Lin", title = "Statistical admission control using delay distribution measurements", journal = j-TOMCCAP, volume = "2", number = "4", pages = "258--281", month = nov, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Apr 14 11:19:17 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2006:MSP, author = "H. Li and M. Li and B. Prabhakaran", title = "Middleware for streaming {$3$D} progressive meshes over lossy networks", journal = j-TOMCCAP, volume = "2", number = "4", pages = "282--317", month = nov, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Apr 14 11:19:17 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Etsion:2006:PPU, author = "Yoav Etsion and Dan Tsafrir and Dror G. Feitelson", title = "Process prioritization using output production: {Scheduling} for multimedia", journal = j-TOMCCAP, volume = "2", number = "4", pages = "318--342", month = nov, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Apr 14 11:19:17 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cesar:2006:GAH, author = "Pablo Cesar and Petri Vuorimaa and Juha Vierinen", title = "A graphics architecture for high-end interactive television terminals", journal = j-TOMCCAP, volume = "2", number = "4", pages = "343--357", month = nov, year = "2006", CODEN = "????", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Apr 14 11:19:17 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Madhwacharyula:2006:MHV, author = "Chitra L. Madhwacharyula and Marc Davis and Philippe Mulhem and Mohan S. 
Kankanhalli",
  title =        "Metadata handling: a video perspective",
  journal =      j-TOMCCAP,
  volume =       "2",
  number =       "4",
  pages =        "358--388",
  month =        nov,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% NB: the volume 3, number 1 entries below previously carried placeholder
%%% pages = "??--??"; page ranges have been normalized to the article-number
%%% form N:1--N:?? used for this journal from volume 3, number 2 onward.

@Article{Atrey:2007:GOO,
  author =       "Pradeep K. Atrey and Mohan S. Kankanhalli and John B. Oommen",
  title =        "Goal-oriented optimal subset selection of correlated multimedia streams",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2007:DSI,
  author =       "Datong Chen and Jie Yang and Robert Malkin and Howard D. Wactlar",
  title =        "Detecting social interactions of the elderly in a nursing home environment",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Heck:2007:VV,
  author =       "Rachel Heck and Michael Wallick and Michael Gleicher",
  title =        "Virtual videography",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Truong:2007:VAS,
  author =       "Ba Tu Truong and Svetha Venkatesh",
  title =        "Video abstraction: a systematic review and classification",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Xu:2007:CAD,
  author =       "Changsheng Xu and Namunu C. Maddage and Xi Shao and Qi Tian",
  title =        "Content-adaptive digital music watermarking based on music structure analysis",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yan:2007:MSO,
  author =       "Wei-Qi Yan and Mohan S. Kankanhalli",
  title =        "Multimedia simplification for optimized {MMS} synthesis",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2007:CAT,
  author =       "Tiecheng Liu and John R.
Kender", title = "Computational approaches to temporal sampling of video sequences", journal = j-TOMCCAP, volume = "3", number = "2", pages = "7:1--7:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230813", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video key frame extraction is one of the most important research problems for video summarization, indexing, and retrieval. For a variety of applications such as ubiquitous media access and video streaming, the temporal boundaries between video key frames are required for synchronizing visual content with audio. In this article, we define temporal video sampling as a unified process of extracting video key frames and computing their temporal boundaries, and formulate it as an optimization problem. We first provide an optimal approach that minimizes temporal video sampling error using a dynamic programming process. The optimal approach retrieves a key frame hierarchy and all temporal boundaries in $ O(n^4) $ time and $ O(n^2) $ space. To further reduce computational complexity, we also provide a suboptimal greedy algorithm that exploits the data structure of a binary heap and uses a novel ``look-ahead'' computational technique, enabling all levels of key frames to be extracted with an average-case computational time of $ O(n \log n) $ and memory usage of $ O(n) $. Both the optimal and the greedy methods are free of parameters, thus avoiding the threshold-selection problem that exists in other approaches. We empirically compare the proposed optimal and greedy methods with several existing methods in terms of video sampling error, computational cost, and subjective quality. 
An evaluation of eight videos of different genres shows that the greedy approach achieves performance very close to that of the optimal approach while drastically reducing computational cost, making it suitable for processing long video sequences in large video databases.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "key frame selection; temporal video sampling; ubiquitous media access; video content analysis; video summarization", } @Article{Moncrieff:2007:OAB, author = "Simon Moncrieff and Svetha Venkatesh and Geoff West", title = "Online audio background determination for complex audio environments", journal = j-TOMCCAP, volume = "3", number = "2", pages = "8:1--8:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230814", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present a method for foreground/background separation of audio using a background modelling technique. The technique models the background in an online, unsupervised, and adaptive fashion, and is designed for application to long term surveillance and monitoring problems. The background is determined using a statistical method to model the states of the audio over time. In addition, three methods are used to increase the accuracy of background modelling in complex audio environments. Such environments can cause the failure of the statistical model to accurately capture the background states. An entropy-based approach is used to unify background representations fragmented over multiple states of the statistical model. 
The approach successfully unifies such background states, resulting in a more robust background model. We adaptively adjust the number of states considered background according to background complexity, resulting in the more accurate classification of background models. Finally, we use an auxiliary model cache to retain potential background states in the system. This prevents the deletion of such states due to a rapid influx of observed states that can occur for highly dynamic sections of the audio signal. The separation algorithm was successfully applied to a number of audio environments representing monitoring applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "audio analysis; online background modelling; surveillance and monitoring", } @Article{Oshima:2007:PDS, author = "Chika Oshima and Kazushi Nishimoto and Norihiro Hagita", title = "A piano duo support system for parents to lead children to practice musical performances", journal = j-TOMCCAP, volume = "3", number = "2", pages = "9:1--9:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230815", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose ``Family Ensemble,'' a piano duo support system for a musically inept parent and his/her child who is a beginner at playing the piano. The system makes it easier for parents to correctly reproduce a given sequence of pitches along with the child's performance by using score tracking and note-replacement functions. 
The experiments with this support system showed that the parents can immediately participate in the piano duo. Furthermore, we found that during joint practices using Family Ensemble some subjects discussed musical ideas that they would not have talked about without using the system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "entertainment; musical expression; piano duo; score tracking; support system", } @Article{He:2007:CSW, author = "Xiaofei He and Deng Cai and Ji-Rong Wen and Wei-Ying Ma and Hong-Jiang Zhang", title = "Clustering and searching {WWW} images using link and page layout analysis", journal = j-TOMCCAP, volume = "3", number = "2", pages = "10:1--10:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230816", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Due to the rapid growth of the number of digital images on the Web, there is an increasing demand for an effective and efficient method for organizing and retrieving the available images. This article describes iFind, a system for clustering and searching WWW images. By using a vision-based page segmentation algorithm, a Web page is partitioned into blocks, and the textual and link information of an image can be accurately extracted from the block containing that image. The textual information is used for image indexing. By extracting the page-to-block, block-to-image, block-to-page relationships through link structure and page layout analysis, we construct an image graph. 
Our method is less sensitive to noisy links than previous methods like PageRank, HITS, and PicASHOW, and hence the image graph can better reflect the semantic relationship between images. Using the notion of Markov Chain, we can compute the limiting probability distributions of the images, ImageRanks, which characterize the importance of the images. The ImageRanks are combined with the relevance scores to produce the final ranking for image search. With the graph models, we can also use techniques from spectral graph theory for image clustering and embedding, or 2-D visualization. Some experimental results on 11.6 million images downloaded from the Web are provided in the article.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "image clustering; image search; link analysis; Web mining", } @Article{Jung:2007:NBA, author = "Byunghee Jung and Junehwa Song and Yoonjoon Lee", title = "A narrative-based abstraction framework for story-oriented video", journal = j-TOMCCAP, volume = "3", number = "2", pages = "11:1--11:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230817", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes a novel video abstraction framework for online review services of story-oriented videos such as dramas. Among the many genres of TV programs, a drama is one of the most popularly watched on the Web. The abstracts generated by the proposed framework not only give a summary of a video but also effectively help viewers understand the overall story. In addition, our method is duration-flexible. 
We get clues about human understanding of a story from scenario writing rules and editorial techniques that are popularly used in the process of video production to explicitly express a narrative, and propose a new video abstraction model, called a Narrative Abstraction Model. The model effectively captures the narrative structure embedded in a story-oriented video and articulates the progress of the story in a weighted directed graph, called a Narrative Structure Graph (NSG). The model provides a basis for a flexible framework for abstract generation using the NSG as the intermediary representation of a video. Different abstracts can be appropriately generated based upon different user requirements. To show the effectiveness of the proposed model and method, we developed a video abstraction system realizing the framework, and successfully applied it to large volumes of TV dramas. The evaluation results show that the proposed framework is a feasible solution for online review services.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "film; narrative structure; online review services; story understanding; story-oriented; video abstraction; video abstraction system", } @Article{Shacham:2007:UDP, author = "Ron Shacham and Henning Schulzrinne and Srisakul Thakolsri and Wolfgang Kellerer", title = "Ubiquitous device personalization and use: {The} next generation of {IP} multimedia communications", journal = j-TOMCCAP, volume = "3", number = "2", pages = "12:1--12:??", month = may, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1230812.1230818", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:04 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Service usage in emerging ubiquitous environments includes seamless and personalized usage of public and private devices discovered in the vicinity of a user. In our work, we describe an architecture for device discovery, device configuration, and the transfer of active sessions between devices. The presented architecture uses the Session Initiation Protocol (SIP) as a standardized, widely used signaling protocol for IP-based multimedia services. Our solution includes support of simple existing devices, split of sessions between devices, user-control of location-based behavior, and handling of security and privacy concerns. We present the implementation and show the feasibility of our work with analytical evaluation and measurements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Internet multimedia; location-based services; mobile communications; ubiquitous computing", } @Article{Chen:2007:EMO, author = "Herng-Yow Chen and Sheng-Wei Li", title = "Exploring many-to-one speech-to-text correlation for {Web}-based language learning", journal = j-TOMCCAP, volume = "3", number = "3", pages = "13:1--13:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236472", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article investigates the correlations between multimedia objects (particularly speech and text) involved in language lectures in order to design an effective presentation mechanism for web-based learning. The cross-media correlations are classified into implicit relations (retrieved by computing) and explicit relations (recorded during the preprocessing stage). The implicit temporal correlation between speech and text is primarily to help to negotiate supplementary lecture navigations like tele-pointer movement, lips-sync movement, and content scrolling. We propose a speech-text alignment framework, using an iterative algorithm based on local alignment, to probe many-to-one temporal correlations, and not the one-to-one only. The proposed framework is a more practical method for analyzing general language lectures, and the algorithm's time complexity conforms to the best-possible computation cost, O(nm), without introducing additional computation. In addition, we have shown the feasibility of creating vivid presentations by exploiting implicit relations and artificially simulating some explicit media. 
To facilitate the navigation of integrated multimedia documents, we develop several visualization techniques for describing media correlations, including guidelines for speech-text correlations, visible-automatic scrolling, and levels of detail of timeline, to provide intuitive and easy-to-use random access mechanisms. We evaluated the performance of the analysis method and human perceptions of the synchronized presentation. The overall performance of the analysis method is that about 99.5\% of the words analyzed are of a temporal error within 0.5 sec and the subjective evaluation result shows that the synchronized presentation is highly acceptable to human beings.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "analysis and presentation; computed synchronization; cross-media correlation; lips sync; speech-to-text alignment", } @Article{Wang:2007:EST, author = "Surong Wang and Manoranjan Dash and Liang-Tien Chia and Min Xu", title = "Efficient sampling of training set in large and noisy multimedia data", journal = j-TOMCCAP, volume = "3", number = "3", pages = "14:1--14:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236473", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As the amount of multimedia data is increasing day-by-day thanks to less expensive storage devices and increasing numbers of information sources, machine learning algorithms are faced with large-sized and noisy datasets. Fortunately, the use of a good sampling set for training influences the final results significantly. 
But using a simple random sample (SRS) may not obtain satisfactory results because such a sample may not adequately represent the large and noisy dataset due to its blind approach in selecting samples. The difficulty is particularly apparent for huge datasets where, due to memory constraints, only very small sample sizes are used. This is typically the case for multimedia applications, where data size is usually very large. In this article we propose a new and efficient method to sample of large and noisy multimedia data. The proposed method is based on a simple distance measure that compares the histograms of the sample set and the whole set in order to estimate the representativeness of the sample. The proposed method deals with noise in an elegant manner which SRS and other methods are not able to deal with. We experiment on image and audio datasets. Comparison with SRS and other methods shows that the proposed method is vastly superior in terms of sample representativeness, particularly for small sample sizes although time-wise it is comparable to SRS, the least expensive method in terms of time.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "audio event identification; histogram; image classification; noise; sampling", } @Article{Zhou:2007:CCO, author = "Suiping Zhou and Wentong Cai and Stephen J. 
Turner and Bu-Sung Lee and Junhu Wei", title = "Critical causal order of events in distributed virtual environments", journal = j-TOMCCAP, volume = "3", number = "3", pages = "15:1--15:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236474", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We investigate the causal order of events in distributed virtual environments (DVEs). We first define the critical causal order relation among the events. Then, we propose some mechanisms to enhance the prevalent RO (receive order delivery) mechanism in DVEs so that the real-time property of DVEs is preserved while the critical causal order violations are reduced. These mechanisms are implemented as a middleware. Experimental results show that the middleware performs well in reducing the critical causality violations in simulation and incurs little processing overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "causal order; distributed simulation; virtual environments", } @Article{Li:2007:SRM, author = "Chuanjun Li and S. Q. Zheng and B. 
Prabhakaran", title = "Segmentation and recognition of motion streams by similarity search", journal = j-TOMCCAP, volume = "3", number = "3", pages = "16:1--16:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236475", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Fast and accurate recognition of motion data streams from gesture sensing and motion capture devices has many applications and is the focus of this article. Based on the analysis of the geometric structures revealed by singular value decompositions (SVD) of motion data, a similarity measure is proposed for simultaneously segmenting and recognizing motion streams. A direction identification approach is explored to further differentiate motions with similar data geometric structures. Experiments show that the proposed similarity measure can segment and recognize motion streams of variable lengths with high accuracy, without knowing beforehand the number of motions in a stream.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "gesture recognition; motion capture; pattern analysis; principal component analysis; segmentation; similarity measures; singular value decomposition", } @Article{Ott:2007:OAT, author = "David E. 
Ott and Ketan Mayer-Patel", title = "An open architecture for transport-level protocol coordination in distributed multimedia applications", journal = j-TOMCCAP, volume = "3", number = "3", pages = "17:1--17:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236476", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We consider the problem of flow coordination in distributed multimedia applications. Most transport-level protocols are designed to operate independently and lack mechanisms for sharing information with other flows and coordinating data transport in various ways. This limitation becomes problematic in distributed applications that employ numerous flows between two computing clusters sharing the same intermediary forwarding path across the Internet. In this article, we propose an open architecture that supports the sharing of network state information, peer flow information, and application-specific information. Called simply the coordination protocol (CP), the scheme facilitates coordination of network resource usage across flows belonging to the same application, as well as aiding other types of coordination. The effectiveness of our approach is illustrated in the context of multistreaming in 3D tele-immersion where consistency of network information across flows both greatly improves frame transport synchrony and minimizes buffering delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "distributed applications; flow coordination; network protocols", } @Article{Sakr:2007:RCB, author = "Ziad Sakr and Nicolas D. 
Georganas", title = "Robust content-based {MPEG}-4 {XMT} scene structure authentication and multimedia content location", journal = j-TOMCCAP, volume = "3", number = "3", pages = "18:1--18:??", month = aug, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1236471.1236477", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:10:32 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For the past decade, there have been numerous research works focusing on the protection of digital images, audio, video, 3D virtual scenes, and software data from unauthorized use and distribution. With the emerging technology of the MPEG-4 standard, MPEG-4 scenes that may include images, video, audio, and 3D objects can easily be built using the text-based MPEG-4 XMT standard. XMT allows content authors to exchange their content with other authors, tools, or service providers and facilitates interoperability with MPEG-4, X3D, and SMIL. In order for owners and designers to protect and/or authenticate their work, some form of security needs to be applied into the MPEG-4 XMT structure and its media content. Unlike images or videos, watermarking an XMT structure is not an easy task, since the structure contains no noise components to embed the watermark. This article is the first one proposing a novel robust algorithm for the authentication of a given MPEG-4 XMT structured scene and the location of its multimedia content.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "MPEG-4; multimedia; polynomial; pseudorandom sequences; steganography; VRML; watermarking; XML; XMT",
}

%%% NB: articleno = "1" added below to match the page-range prefix 1:1--1:4
%%% and the articleno convention used by the surrounding entries.

@Article{Ghinea:2007:ISI,
  author =       "Gheorghita Ghinea and Chabane Djeraba and Stephen Gulliver and Kara Pernice Coyne",
  title =        "Introduction to special issue on eye-tracking applications in multimedia systems",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "1:1--1:4",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314304",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Colombo:2007:RTR,
  author =       "Carlo Colombo and Dario Comanducci and Alberto {Del Bimbo}",
  title =        "Robust tracking and remapping of eye appearance with passive computer vision",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "2:1--2:20",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314305",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A single-camera iris-tracking and remapping approach based on passive computer vision is presented. Tracking is aimed at obtaining accurate and robust measurements of the iris/pupil position.
To this purpose, a robust method for ellipse fitting is used, employing search constraints so as to achieve better performance with respect to the standard RANSAC algorithm. Tracking also embeds an iris localization algorithm (working as a bootstrap multiple-hypotheses generation step), and a blink detector that can detect voluntary eye blinks in human-computer interaction applications. On-screen remapping incorporates a head-tracking method capable of compensating for small user-head movements. The approach operates in real time under different light conditions and in the presence of distractors. An extensive set of experiments is presented and discussed. In particular, an evaluation method for the choice of layout of both hardware components and calibration points is described. Experiments also investigate the importance of providing a visual feedback to the user, and the benefits gained from performing head compensation, especially during image-to-screen map calibration.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "eye blink detection; eye tracking and remapping; eye-driven human-computer interaction; robust fitting", } @Article{Wang:2007:UGP, author = "Jun Wang and Lijun Yin and Jason Moore", title = "Using geometric properties of topographic manifold to detect and track eyes for human-computer interaction", journal = j-TOMCCAP, volume = "3", number = "4", pages = "3:1--3:20", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314306", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Automatic eye detection and tracking is an important component for advanced human-computer interface design. Accurate eye localization can help develop a successful system for face recognition and emotion identification. In this article, we propose a novel approach to detect and track eyes using geometric surface features on topographic manifold of eye images. First, in the joint spatial-intensity domain, a facial image is treated as a 3D terrain surface or image topographic manifold. In particular, eye regions exhibit certain intrinsic geometric traits on this topographic manifold, namely, the pit-labeled center and hillside-like surround regions. Applying a terrain classification procedure on the topographic manifold of facial images, each location of the manifold can be labeled to generate a terrain map. We use the distribution of terrain labels to represent the eye terrain pattern. The Bhattacharyya affinity is employed to measure the distribution similarity between two topographic manifolds. 
Based on the Bhattacharyya kernel, a support vector machine is applied for selecting proper eye pairs from the pit-labeled candidates. Second, given detected eyes on the first frame of a video sequence, a mutual-information-based fitting function is defined to describe the similarity between two terrain surfaces of neighboring frames. By optimizing the fitting function, eye locations are updated for subsequent frames. The distinction of the proposed approach lies in that both eye detection and eye tracking are performed on the derived topographic manifold, rather than on an original-intensity image domain. The robustness of the approach is demonstrated under various imaging conditions and with different facial appearances, using both static images and video sequences without background constraints.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Bhattacharyya affinity; eye detection; eye tracking; mutual information; topographic manifold", } @Article{Agrafiotis:2007:TEC, author = "D. Agrafiotis and S. J. C. Davies and N. Canagarajah and D. R. Bull", title = "Towards efficient context-specific video coding based on gaze-tracking analysis", journal = j-TOMCCAP, volume = "3", number = "4", pages = "4:1--4:15", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314307", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article discusses a framework for model-based, context-dependent video coding based on exploitation of characteristics of the human visual system. 
The system utilizes variable-quality coding based on priority maps which are created using mostly context-dependent rules. The technique is demonstrated through two case studies of specific video context, namely open signed content and football sequences. Eye-tracking analysis is employed for identifying the characteristics of each context, which are subsequently exploited for coding purposes, either directly or through a gaze prediction model. The framework is shown to achieve a considerable improvement in coding efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "applications; context-based video coding; eye tracking; multimedia perceptual quality; subjective video quality; transformation of eye movements into useful knowledge", } @Article{Urruty:2007:DEF, author = "Thierry Urruty and Stanislas Lew and Nacim Ihadaddene and Dan A. Simovici", title = "Detecting eye fixations by projection clustering", journal = j-TOMCCAP, volume = "3", number = "4", pages = "5:1--5:20", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314308", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Eye movements are certainly the most natural and repetitive movement of a human being. 
The most mundane activity, such as watching television or reading a newspaper, involves this automatic activity which consists of shifting our gaze from one point to another.\par Identification of the components of eye movements (fixations and saccades) is an essential part in the analysis of visual behavior because these types of movements provide the basic elements used by further investigations of human vision.\par However, many of the algorithms that detect fixations present a number of problems. In this article, we present a new fixation identification technique that is based on clustering of eye positions, using projections and projection aggregation applied to static pictures. We also present a new method that computes dispersion of eye fixations in videos considering a multiuser environment.\par To demonstrate the performance and usefulness of our approach we discuss our experimental work with two different applications: on fixed image and video.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "eye fixations; interaction modeling; projected clustering; static pictures; videos", } @Article{Duchowski:2007:FGC, author = "Andrew T. Duchowski and Arzu {\c{C}}{\"o}ltekin", title = "Foveated gaze-contingent displays for peripheral {LOD} management, {$3$D} visualization, and stereo imaging", journal = j-TOMCCAP, volume = "3", number = "4", pages = "6:1--6:18", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314309", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Advancements in graphics hardware have allowed development of hardware-accelerated imaging displays. 
This article reviews techniques for real-time simulation of arbitrary visual fields over still images and video. The goal is to provide the vision sciences and perceptual graphics communities techniques for the investigation of fundamental processes of visual perception. Classic gaze-contingent displays used for these purposes are reviewed and for the first time a pixel shader is introduced for display of a high-resolution window over peripherally degraded stimulus. The pixel shader advances current state-of-the-art by allowing real-time processing of still or streamed images, obviating the need for preprocessing or storage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "eye tracking; foveation; gaze-contingent displays; level-of-detail", } @Article{Loschky:2007:HLC, author = "Lester C. Loschky and Gary S. Wolverton", title = "How late can you update gaze-contingent multiresolutional displays without detection?", journal = j-TOMCCAP, volume = "3", number = "4", pages = "7:1--7:10", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314310", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study investigated perceptual disruptions in gaze-contingent multiresolutional displays (GCMRDs) due to delays in updating the center of highest resolution after an eye movement. GCMRDs can be used to save processing resources and transmission bandwidth in many types of single-user display applications, such as virtual reality, video-telephony, simulators, and remote piloting. 
The current study found that image update delays as late as 60 ms after an eye movement did not significantly increase the detectability of image blur and/or motion transients due to the update. This is good news for designers of GCMRDs, since 60 ms is ample time to update many GCMRDs after an eye movement without disrupting perception. The study also found that longer eye movements led to greater blur and/or transient detection due to moving the eyes further into the low-resolution periphery, effectively reducing the image resolution at fixation prior to the update. In GCMRD applications where longer saccades are more likely (e.g., displays with relatively large distances between objects), this problem could be overcome by increasing the size of the region of highest resolution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "area of interest; bandwidth; blur detection; contrast thresholds; display updates; eye movements; eye tracking; foveated; foveation; gaze-contingent; level-of-detail; multiresolution; perceptual compression; peripheral vision; saccades; saccadic suppression; visual perception", } @Article{Murray:2007:AEG, author = "Norman Murray and Dave Roberts and Anthony Steed and Paul Sharkey and Paul Dickerson and John Rae", title = "An assessment of eye-gaze potential within immersive virtual environments", journal = j-TOMCCAP, volume = "3", number = "4", pages = "8:1--8:17", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314311", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In collaborative situations, eye gaze is a critical element of behavior 
which supports and fulfills many activities and roles. In current computer-supported collaboration systems, eye gaze is poorly supported. Even in a state-of-the-art video conferencing system such as the access grid, although one can see the face of the user, much of the communicative power of eye gaze is lost. This article gives an overview of some preliminary work that looks towards integrating eye gaze into an immersive collaborative virtual environment and assessing the impact that this would have on interaction between the users of such a system. Three experiments were conducted to assess the efficacy of eye gaze within immersive virtual environments. In each experiment, subjects observed on a large screen the eye-gaze behavior of an avatar. The eye-gaze behavior of that avatar had previously been recorded from a user with the use of a head-mounted eye tracker. The first experiment was conducted to assess the difference between users' abilities to judge what objects an avatar is looking at with only head gaze being viewed and also with eye- and head-gaze data being displayed. The results from the experiment show that eye gaze is of vital importance to the subjects, correctly identifying what a person is looking at in an immersive virtual environment. The second experiment examined whether a monocular or binocular eye-tracker would be required. This was examined by testing subjects' ability to identify where an avatar was looking from their eye direction alone, or by eye direction combined with convergence. This experiment showed that convergence had a significant impact on the subjects' ability to identify where the avatar was looking. The final experiment looked at the effects of stereo and mono-viewing of the scene, with the subjects being asked to identify where the avatar was looking. This experiment showed that there was no difference in the subjects' ability to detect where the avatar was gazing. 
This is followed by a description of how the eye-tracking system has been integrated into an immersive collaborative virtual environment and some preliminary results from the use of such a system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "eye gaze; immersive virtual environments", } @Article{Rachovides:2007:CIM, author = "Dorothy Rachovides and James Walkerdine and Peter Phillips", title = "The conductor interaction method", journal = j-TOMCCAP, volume = "3", number = "4", pages = "9:1--9:23", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1314303.1314312", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:11:20 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Computers have increasingly become part of our everyday lives, with many activities either involving their direct use or being supported by one. This has prompted research into developing methods and mechanisms to assist humans in interacting with computers (human-computer interaction, or HCI). A number of HCI techniques have been developed over the years, some of which are quite old but continue to be used, and some more recent and still evolving. Many of these interaction techniques, however, are not natural in their use and typically require the user to learn a new means of interaction. Inconsistencies within these techniques and the restrictions they impose on user creativity can also make such interaction techniques difficult to use, especially for novice users.\par This article proposes an alternative interaction method, the conductor interaction method (CIM), which aims to provide a more natural and easier-to-learn interaction technique. 
This novel interaction method extends existing HCI methods by drawing upon techniques found in human-human interaction. It is argued that the use of a two-phased multimodal interaction mechanism, using gaze for selection and gesture for manipulation, incorporated within a metaphor-based environment, can provide a viable alternative for interacting with a computer (especially for novice users). Both the model and an implementation of the CIM within a system are presented in this article. This system formed the basis of a number of user studies that have been performed to assess the effectiveness of the CIM, the findings of which are discussed in this work.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "gaze- and gesture-based interfaces; human-computer interaction", } @Article{Luo:2008:IFH, author = "Hangzai Luo and Yuli Gao and Xiangyang Xue and Jinye Peng and Jianping Fan", title = "Incorporating feature hierarchy and boosting to achieve more effective classifier training and concept-oriented video summarization and skimming", journal = j-TOMCCAP, volume = "4", number = "1", pages = "1:1--1:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For online medical education purposes, we have developed a novel scheme to incorporate the results of semantic video classification to select the most representative video shots for generating concept-oriented summarization and skimming of surgery education videos. 
First, salient objects are used as the video patterns for feature extraction to achieve a good representation of the intermediate video semantics. The salient objects are defined as the salient video compounds that can be used to characterize the most significant perceptual properties of the corresponding real world physical objects in a video, and thus the appearances of such salient objects can be used to predict the appearances of the relevant semantic video concepts in a specific video domain. Second, a novel multi-modal boosting algorithm is developed to achieve more reliable video classifier training by incorporating feature hierarchy and boosting to dramatically reduce both the training cost and the size of training samples, thus it can significantly speed up SVM (support vector machine) classifier training. In addition, the unlabeled samples are integrated to reduce the human efforts on labeling large amount of training samples. Finally, the results of semantic video classification are incorporated to enable concept-oriented video summarization and skimming. Experimental results in a specific domain of surgery education videos are provided.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "concept-oriented video skimming; feature hierarchy; multi-modal boosting; salient objects; semantic video classification; unlabeled samples", } @Article{Hefeeda:2008:RDO, author = "Mohamed Hefeeda and Cheng-Hsin Hsu", title = "Rate-distortion optimized streaming of fine-grained scalable video sequences", journal = j-TOMCCAP, volume = "4", number = "1", pages = "2:1--2:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324289", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present optimal schemes for allocating bits of fine-grained scalable video sequences among multiple senders streaming to a single receiver. This allocation problem is critical in optimizing the perceived quality in peer-to-peer and distributed multi-server streaming environments. Senders in such environments are heterogeneous in their outgoing bandwidth and they hold different portions of the video stream. We first formulate and optimally solve the problem for individual frames, then we generalize to the multiple frame case. Specifically, we formulate the allocation problem as an optimization problem, which is nonlinear in general. We use rate-distortion models in the formulation to achieve the minimum distortion in the rendered video, constrained by the outgoing bandwidth of senders, availability of video data at senders, and incoming bandwidth of receiver. We show how the adopted rate-distortion models transform the nonlinear problem to an integer linear programming (ILP) problem. 
We then design a simple rounding scheme that transforms the ILP problem to a linear programming (LP) one, which can be solved efficiently using common optimization techniques such as the Simplex method. We prove that our rounding scheme always produces a feasible solution, and the solution is within a negligible margin from the optimal solution. We also propose a new algorithm (FGSAssign) for the single-frame allocation problem that runs in $ O(n \log n) $ steps, where n is the number of senders. We prove that FGSAssign is optimal. Furthermore, we propose a heuristic algorithm (mFGSAssign) that produces near-optimal solutions for the multiple-frame case, and runs an order of magnitude faster than the optimal one. Because of its short running time, mFGSAssign can be used in real time. Our experimental study validates our analytical analysis and shows the effectiveness of our allocation algorithms in improving the video quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "distributed streaming; FGS; fine-grained scalable streaming; peer-to-peer streaming; rate-distortion models; rate-distortion optimized streaming; video streaming", } @Article{Babich:2008:VQE, author = "Fulvio Babich and Marco D'Orlando and Francesca Vatta", title = "Video quality estimation in wireless {IP} networks: {Algorithms} and applications", journal = j-TOMCCAP, volume = "4", number = "1", pages = "3:1--3:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324290", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes three methods to estimate the distortion deriving from packet losses in wireless video communication. The proposed methods take into account the short-term properties of the encoded video sequences. A suitable set of functions is adopted to model the distortion envelope resulting from multiple losses. The estimated performance is compared with the actual distortion, evaluated by decoding the received sequence with a properly designed decoder. Numerical results confirm the accuracy of the proposed models in approximating the actual Mean Square Error (MSE) for a wide range of loss rates. Some applications of the proposed algorithms are presented.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "distortion estimation; error-concealment; error-resilience; H.264; packet loss rate; real time video; wireless networks", } @Article{Kotharu:2008:PQR, author = "Phani S. Kotharu and B. Prabhakaran", title = "Partial query resolution for animation authoring", journal = j-TOMCCAP, volume = "4", number = "1", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324291", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Animations are a part of multimedia and techniques such as motion mapping and inverse kinematics aid in reusing models and motion sequences to create new animations. This reuse approach is facilitated by the use of content-based retrieval techniques that often require fuzzy query resolution. Most fuzzy query resolution approaches work on all the attributes of the query to minimize the database access cost thus resulting in an unsatisfactory result set. It turns out that the query resolution can be carried out in a partial manner to achieve user satisfactory results and aid in easy authoring. In this article, we present two partial fuzzy query resolution approaches, one that results in high-quality animations and the other that produces results with decreasing number of satisfied conditions in the query.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "aggregation function; animation toolkit; fuzzy query; multimedia authoring; partial ordering; top-k query", } @Article{Ip:2008:RRS, author = "Alan T. S. Ip and John C. S. Lui and Jiangchuan Liu", title = "A revenue-rewarding scheme of providing incentive for cooperative proxy caching for media streaming systems", journal = j-TOMCCAP, volume = "4", number = "1", pages = "5:1--5:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324292", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Network entities cooperating together can improve system performance of media streaming. In this paper, we address the ``incentive issue'' of a cooperative proxy caching system and how to motivate each proxy to provide cache space to the system. To encourage proxies to participate, we propose a ``revenue-rewarding scheme'' to credit the cooperative proxies according to the resources they contribute. A game-theoretic model is used to analyze the interactions among proxies under the revenue-rewarding scheme. We propose two cooperative game settings that lead to optimal situations. In particular, (1) We propose a distributed incentive framework for peers to participate in resource contribution for media streaming; (2) Proxies are encouraged to cooperate under the revenue-rewarding scheme; (3) Profit and social welfare are maximized in these cooperative games; and (4) Cost-effective resource allocation is achieved in these cooperative games. 
Large scale simulation is carried out to validate and verify the merits of our proposed incentive schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "game-theoretic analysis; incentive mechanism; Nash equilibrium; pricing; resource allocation", } @Article{Zhang:2008:AEE, author = "Cha Zhang and Yong Rui and Jim Crawford and Li-Wei He", title = "An automated end-to-end lecture capture and broadcasting system", journal = j-TOMCCAP, volume = "4", number = "1", pages = "6:1--6:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324293", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Remote viewing of lectures presented to a live audience is becoming increasingly popular. At the same time, the lectures can be recorded for subsequent on-demand viewing over the Internet. Providing such services, however, is often prohibitive due to the labor-intensive cost of capturing and pre/post-processing. This article presents a complete automated end-to-end system that supports capturing, broadcasting, viewing, archiving and searching of presentations. Specifically, we describe a system architecture that minimizes the pre- and post-production time, and a fully automated lecture capture system called iCam2 that synchronously captures all contents of the lecture, including audio, video, and presentation material. No staff is needed during lecture capture and broadcasting, so the operational cost of the system is negligible. The system has been used on a daily basis for more than 4 years, during which 522 lectures have been captured. 
These lectures have been viewed over 20,000 times.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "automated lecture capture; lecture broadcasting; live/on-demand broadcasting", } @Article{Nguyen:2008:OIV, author = "Giang Phuong Nguyen and Marcel Worring", title = "Optimization of interactive visual-similarity-based search", journal = j-TOMCCAP, volume = "4", number = "1", pages = "7:1--7:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324294", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "At one end of the spectrum, research in interactive content-based retrieval concentrates on machine learning methods for effective use of relevance feedback. On the other end, the information visualization community focuses on effective methods for conveying information to the user. What is lacking is research considering the information visualization and interactive retrieval as truly integrated parts of one content-based search system. In such an integrated system, there are many degrees of freedom like the similarity function, the number of images to display, the image size, different visualization modes, and possible feedback modes. To base the optimal values for all of those on user studies is unfeasible. We therefore develop search scenarios in which tasks and user actions are simulated. From there, the proposed scheme is optimized based on objective constraints and evaluation criteria. In such a manner, the degrees of freedom are reduced and the remaining degrees can be evaluated in user studies. 
In this article, we present a system that integrates advanced similarity based visualization with active learning. We have performed extensive experimentation on interactive category search with different image collections. The results using the proposed simulation scheme show that indeed the use of advanced visualization and active learning pays off in all of these datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "active learning; interactive search; similarity based visualization", } @Article{Hlavacs:2008:HVP, author = "Helmut Hlavacs and Shelley Buchinger", title = "Hierarchical video patching with optimal server bandwidth", journal = j-TOMCCAP, volume = "4", number = "1", pages = "8:1--8:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1324287.1324295", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:06 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video patching is a way for transporting true video-on-demand, that is, instantaneous without any delay, from a video server to several clients. Instead of sending a unique stream to each newly arriving client, clients share as many multicast transmissions as possible, and are serviced only those parts of the video that they have missed.\par We present a novel video patching scheme using hierarchies of patches. Our scheme minimizes the bandwidth needed by the video server, and may result in the fact that clients receive several streams in parallel. 
We show analytically that for Poisson arrival our algorithm achieves the optimal possible server bandwidth for all schemes where clients share multicast transmissions.\par We also show how our approach can be combined with batching. This combination requires less server bandwidth than all fixed start point periodic broadcast algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "batching; server bandwidth; true video-on-demand; video patching", } @Article{Chen:2008:ASD, author = "Songqing Chen and Shiping Chen and Huiping Guo and Bo Shen and Sushil Jajodia", title = "Achieving simultaneous distribution control and privacy protection for {Internet} media delivery", journal = j-TOMCCAP, volume = "4", number = "2", pages = "9:1--9:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352013", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Massive Internet media distribution demands prolonged continuous consumption of networking and disk bandwidths in large capacity. Many proxy-based Internet media distribution algorithms and systems have been proposed, implemented, and evaluated to address the scalability and performance issue. However, few of them have been used in practice, since two important issues are not satisfactorily addressed. First, existing proxy-based media distribution architectures lack an efficient media distribution control mechanism. Without copyright protection, content providers are hesitant to use proxy-based fast distribution techniques. 
Second, little has been done to protect client privacy during content accesses on the Internet. Straightforward solutions to address these two issues independently lead to conflicts. For example, to enforce distribution control, only legitimate users should be granted access rights. However, this normally discloses more information (such as which object the client is accessing) other than the client identity, which conflicts with the client's desire for privacy protection. In this article, we propose a unified proxy-based media distribution protocol to effectively address these two problems simultaneously. We further design a set of new algorithms in a cooperative proxy environment where our proposed scheme works efficiently and practically. Simulation-based experiments are conducted to extensively evaluate the proposed system. Preliminary results demonstrate the effectiveness of our proposed strategy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "cooperative proxy; distribution control; media delivery; privacy; proxy caching", } @Article{Li:2008:FSE, author = "Rui Li and Bir Bhanu and Anlei Dong", title = "Feature synthesized {EM} algorithm for image retrieval", journal = j-TOMCCAP, volume = "4", number = "2", pages = "10:1--10:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352014", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As a commonly used unsupervised learning algorithm in Content-Based Image Retrieval (CBIR), Expectation-Maximization (EM) algorithm has several limitations, including the curse of dimensionality and the 
convergence at a local maximum. In this article, we propose a novel learning approach, namely Coevolutionary Feature Synthesized Expectation-Maximization (CFS-EM), to address the above problems. The CFS-EM is a hybrid of coevolutionary genetic programming (CGP) and EM algorithm applied on partially labeled data. CFS-EM is especially suitable for image retrieval because the images can be searched in the synthesized low-dimensional feature space, while a kernel-based method has to make classification computation in the original high-dimensional space. Experiments on real image databases show that CFS-EM outperforms Radial Basis Function Support Vector Machine (RBF-SVM), CGP, Discriminant-EM (D-EM) and Transductive-SVM (TSVM) in the sense of classification performance and it is computationally more efficient than RBF-SVM in the query phase.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "coevolutionary feature synthesis; content-based image retrieval; expectation maximization; semi-supervised learning", } @Article{Xu:2008:AKG, author = "Min Xu and Changsheng Xu and Lingyu Duan and Jesse S. Jin and Suhuai Luo", title = "Audio keywords generation for sports video analysis", journal = j-TOMCCAP, volume = "4", number = "2", pages = "11:1--11:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352015", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sports video has attracted a global viewership. Research effort in this area has been focused on semantic event detection in sports video to facilitate accessing and browsing. 
Most of the event detection methods in sports video are based on visual features. However, being a significant component of sports video, audio may also play an important role in semantic event detection. In this paper, we have borrowed the concept of the ``keyword'' from the text mining domain to define a set of specific audio sounds. These specific audio sounds refer to a set of game-specific sounds with strong relationships to the actions of players, referees, commentators, and audience, which are the reference points for interesting sports events. Unlike low-level features, audio keywords can be considered as a mid-level representation, able to facilitate high-level analysis from the semantic concept point of view. Audio keywords are created from low-level audio features with learning by support vector machines. With the help of video shots, the created audio keywords can be used to detect semantic events in sports video by Hidden Markov Model (HMM) learning. Experiments on creating audio keywords and, subsequently, event detection based on audio keywords have been very encouraging. Based on the experimental results, we believe that the audio keyword is an effective representation that is able to achieve satisfying results for event detection in sports video. Application in three sports types demonstrates the practicality of the proposed method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "audio keywords; event detection; semantics analysis; sports video analysis; support vector machines", } @Article{Tullimas:2008:MSU, author = "Sunand Tullimas and Thinh Nguyen and Rich Edgecomb and Sen-ching Cheung", title = "Multimedia streaming using multiple {TCP} connections", journal = j-TOMCCAP, volume = "4", number = "2", pages = "12:1--12:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352016", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recent years, multimedia applications over the Internet have become increasingly popular. However, packet loss, delay, and time-varying bandwidth of the Internet have remained the major problems for multimedia streaming applications. As such, a number of approaches, including network infrastructure and protocol, source and channel coding, have been proposed to either overcome or alleviate these drawbacks of the Internet. In this article, we propose the MultiTCP system, a receiver-driven, TCP-based system for multimedia streaming over the Internet. Our proposed algorithm aims at providing resilience against short term insufficient bandwidth by using multiple TCP connections for the same application. Our proposed system enables the application to achieve and control the desired sending rate during congested periods, which cannot be achieved using traditional TCP. Finally, our proposed system is implemented at the application layer, and hence, no kernel modification to TCP is necessary. 
We analyze the proposed system, and present simulation and experimental results to demonstrate its advantages over the traditional single-TCP-based approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "multimedia streaming", } @Article{Tjondronegoro:2008:SES, author = "Dian Tjondronegoro and Yi-Ping Phoebe Chen and Adrien Joly", title = "A scalable and extensible segment-event-object-based sports video retrieval system", journal = j-TOMCCAP, volume = "4", number = "2", pages = "13:1--13:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352017", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sport video data is growing rapidly as a result of the maturing digital technologies that support digital video capture, faster data processing, and large storage. However, (1) semi-automatic content extraction and annotation, (2) scalable indexing model, and (3) effective retrieval and browsing, still pose the most challenging problems for maximizing the usage of large video databases. This article will present the findings from a comprehensive work that proposes a scalable and extensible sports video retrieval system with two major contributions in the area of sports video indexing and retrieval. The first contribution is a new sports video indexing model that utilizes semi-schema-based indexing scheme on top of an Object-Relationship approach. This indexing model is scalable and extensible as it enables gradual index construction which is supported by ongoing development of future content extraction algorithms. 
The second contribution is a set of novel queries which are based on XQuery to generate dynamic and user-oriented summaries and event structures. The proposed sports video retrieval system has been fully implemented and populated with soccer, tennis, swimming, and diving video. The system has been evaluated against 20 users to demonstrate and confirm its feasibility and benefits. The experimental sports genres were specifically selected to represent the four main categories of sports domain: period-, set-point-, time (race)-, and performance-based sports. Thus, the proposed system should be generic and robust for all types of sports.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "automatic content extraction; indexing; mobile video interaction; MPEG-7; sports video retrieval; video database system; XML; XQuery", } @Article{Zimmermann:2008:DMP, author = "Roger Zimmermann and Elaine Chew and Sakire Arslan Ay and Moses Pawar", title = "Distributed musical performances: {Architecture} and stream management", journal = j-TOMCCAP, volume = "4", number = "2", pages = "14:1--14:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352018", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "An increasing number of novel applications produce a rich set of different data types that need to be managed efficiently and coherently. In this article we present our experience with designing and implementing a data management infrastructure for a distributed immersive performance (DIP) application. 
The DIP project investigates a versatile framework for the capture, recording, and replay of video, audio, and MIDI (Musical Instrument Digital Interface) streams in an interactive environment for collaborative music performance. We are focusing on two classes of data streams that are generated within this environment. The first category consists of high-resolution isochronous media streams, namely audio and video. The second class comprises MIDI data produced by electronic instruments. MIDI event sequences are alphanumeric in nature and fall into the category of the data streams that have been of interest to data management researchers in recent years.\par We present our data management architecture, which provides a repository for all DIP data. Streams of both categories need to be acquired, transmitted, stored, and replayed in real time. Data items are correlated across different streams with temporal indices. The audio and video streams are managed in our own High-performance Data Recording Architecture (HYDRA), which integrates multistream recording and retrieval in a consistent manner. This paper reports on the practical issues and challenges that we encountered during the design, implementation and experimental phases of our prototype. We also present some analysis results and discuss future extensions for the architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "distributed immersive performance; multimedia storage; multimodal data recorder; networked musical performance", } @Article{Hsu:2008:ACR, author = "Cheng-Hsin Hsu and Mohamed Hefeeda", title = "On the accuracy and complexity of rate-distortion models for fine-grained scalable video sequences", journal = j-TOMCCAP, volume = "4", number = "2", pages = "15:1--15:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352019", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Rate-distortion (R-D) models are functions that describe the relationship between the bitrate and expected level of distortion in the reconstructed video stream. R-D models enable optimization of the received video quality in different network conditions. Several R-D models have been proposed for the increasingly popular fine-grained scalable video sequences. However, the models' relative performance has not been thoroughly analyzed. Moreover, the time complexity of each model is not known, nor is the range of bitrates in which the model produces valid results. This lack of quantitative performance analysis makes it difficult to select the model that best suits a target streaming system. In this article, we classify, analyze, and rigorously evaluate all R-D models proposed for FGS coders in the literature. We classify R-D models into three categories: analytic, empirical, and semi-analytic. We describe the characteristics of each category. We analyze the R-D models by following their mathematical derivations, scrutinizing the assumptions made, and explaining when the assumptions fail and why. 
In addition, we implement all R-D models, a total of eight, and evaluate them using a diverse set of video sequences. In our evaluation, we consider various source characteristics, diverse channel conditions, different encoding/decoding parameters, different frame types, and several performance metrics including accuracy, range of applicability, and time complexity of each model. We also present clear systematic ways (pseudo codes) for constructing various R-D models from a given video sequence. Based on our experimental results, we present a justified list of recommendations on selecting the best R-D models for video-on-demand, video conferencing, real-time, and peer-to-peer streaming systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "fine-grained scalable coding; multimedia streaming; rate-distortion models", } @Article{Wang:2008:MST, author = "Bing Wang and Jim Kurose and Prashant Shenoy and Don Towsley", title = "Multimedia streaming via {TCP}: an analytic performance study", journal = j-TOMCCAP, volume = "4", number = "2", pages = "16:1--16:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1352012.1352020", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Jun 16 17:12:37 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "TCP is widely used in commercial multimedia streaming systems, with recent measurement studies indicating that a significant fraction of Internet streaming media is currently delivered over HTTP/TCP. These observations motivate us to develop analytic performance models to systematically investigate the performance of TCP for both live and stored-media streaming. 
We validate our models via ns simulations and experiments conducted over the Internet. Our models provide guidelines indicating the circumstances under which TCP streaming leads to satisfactory performance, showing, for example, that TCP generally provides good streaming performance when the achievable TCP throughput is roughly twice the media bitrate, with only a few seconds of startup delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "multimedia streaming; performance modeling", } @Article{Lin:2008:NNB, author = "Tsungnan Lin and Chiapin Wang and Po-Chiang Lin", title = "A neural-network-based context-aware handoff algorithm for multimedia computing", journal = j-TOMCCAP, volume = "4", number = "3", pages = "17:1--17:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386110", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The access of multimedia computing in wireless networks is concerned with the performance of handoff because of the irretrievable property of real-time data delivery. To lessen throughput degradation incurred by unnecessary handoffs or handoff latencies leading to media disruption perceived by users, this paper presents a link quality based handoff algorithm. Neural networks are used to learn the cross-layer correlation between the link quality estimator such as packet success rate and the corresponding context metric indicators, for example, the transmitting packet length, received signal strength, and signal to noise ratio. 
Based on a pre-processed learning of link quality profile, neural networks make essential handoff decisions efficiently with the evaluations of link quality instead of the comparisons between relative signal strength. The experiment and simulation results show that the proposed algorithm improves the user perceived qualities in a transmission scenario of VoIP applications by minimizing both the number of lost packets and unnecessary handoffs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "context-aware; handoff; Multimedia computing; neural networks", } @Article{Franke:2008:TAC, author = "Ingmar S. Franke and Sebastian Pannasch and Jens R. Helmert and Robert Rieger and Rainer Groh and Boris M. Velichkovsky", title = "Towards attention-centered interfaces: an aesthetic evaluation of perspective with eye tracking", journal = j-TOMCCAP, volume = "4", number = "3", pages = "18:1--18:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386111", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The established method of representing three-dimensional space on a two-dimensional surface involves camera based, point of regard systems, comparable in design to the early ``camera obscura''. However, geometrical limitations of such models lead to distortions of perspective when projected. This research investigated the influence of single- versus multi-perspectives on aesthetic choices within one image. A clear perceptual bias towards multi-perspective images was found, additionally supported by an eye tracking study. 
We propose that human users are more attracted by multi-perspective images, which emphasize the ``semantic foci'' of the scene, than by those being synthesized statically with only one geometrical prospect.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Eye tracking; perspective projection; scene perception; subjective evaluation", } @Article{Wu:2008:ELS, author = "Chuan Wu and Baochun Li and Shuqiao Zhao", title = "Exploring large-scale peer-to-peer live streaming topologies", journal = j-TOMCCAP, volume = "4", number = "3", pages = "19:1--19:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386112", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Real-world live peer-to-peer (P2P) streaming applications have been successfully deployed in the Internet, delivering live multimedia content to millions of users at any given time. With relative simplicity in design with respect to peer selection and topology construction protocols and without much algorithmic sophistication, current-generation live P2P streaming applications are able to provide users with adequately satisfying viewing experiences. That said, little existing research has provided sufficient insights on the time-varying internal characteristics of peer-to-peer topologies in live streaming. 
This article presents {\em Magellan}, our collaborative work with UUSee Inc., Beijing, China, for exploring and charting graph theoretical properties of practical P2P streaming topologies, gaining important insights in their topological dynamics over a long period of time.\par With more than 120 GB worth of traces starting September 2006 from a commercially deployed P2P live streaming system that represents UUSee's core product, we have completed a thorough and in-depth investigation of the topological properties in large-scale live P2P streaming, as well as their evolutionary behavior over time, for example, at different times of the day and in flash crowd scenarios. We seek to explore real-world P2P streaming topologies with respect to their graph theoretical metrics, such as the degree, clustering coefficient, and reciprocity. In addition, we compare our findings with results from existing studies on topological properties of P2P file sharing applications, and present new and unique observations specific to streaming. We have observed that live P2P streaming sessions demonstrate excellent scalability, a high level of reciprocity, a clustering phenomenon in each ISP, and a degree distribution that does {\em not\/} follow the power-law distribution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Peer-to-peer streaming; topology characterization", } @Article{Goel:2008:LLA, author = "Ashvin Goel and Charles Krasic and Jonathan Walpole", title = "Low-latency adaptive streaming over {TCP}", journal = j-TOMCCAP, volume = "4", number = "3", pages = "20:1--20:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386113", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Media streaming over TCP has become increasingly popular because TCP's congestion control provides remarkable stability to the Internet. Streaming over TCP requires adapting to bandwidth availability, but unfortunately, TCP can introduce significant latency at the application level, which causes unresponsive and poor adaptation. This article shows that this latency is not inherent in TCP but occurs as a result of throughput-optimized TCP implementations. We show that this latency can be minimized by dynamically tuning TCP's send buffer. Our evaluation shows that this approach leads to better application-level adaptation and it allows supporting interactive and other low-latency applications over TCP.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "low latency streaming; multimedia applications; TCP", } @Article{Lim:2008:DPP, author = "Seung-Ho Lim and Yo-Won Jeong and Kyu Ho Park", title = "Data placement and prefetching with accurate bit rate control for interactive media server", journal = j-TOMCCAP, volume = "4", number = "3", pages = "21:1--21:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386114", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "An interactive Media Server should support unrestricted control to viewers with their service level agreements. It is important to manage video data effectively to facilitate efficient retrieval. In this paper, we propose an efficient placement algorithm as part of an effective retrieval scheme to increase the number of clients who can be provided with interactive service. The proposed management schemes are incorporated with a bit count control method that is based on repeated tuning of quantization parameters to adjust the actual bit count to the target bit count. The encoder using this method can generate coded frames whose sizes are synchronized with the RAID stripe size, so that when various fast-forward levels are accessed we can reduce the seek and rotational latency and enhance the disk throughput of each disk in the RAID system. Experimental results demonstrate that the proposed schemes can significantly improve the average service time and guarantee more users quality of service, and the interactive media server can thereby efficiently service a large number of clients.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "bit count control; disk array; Interactive media server; stripe size; video rate", } @Article{Jie:2008:VGD, author = "Li Jie and James J. Clark", title = "Video game design using an eye-movement-dependent model of visual attention", journal = j-TOMCCAP, volume = "4", number = "3", pages = "22:1--22:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386115", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Eye movements can be used to infer the allocation of covert attention. In this article, we propose to model the allocation of attention in a task-dependent manner based on different eye movement conditions, specifically fixation and pursuit. We show that the image complexity at eye fixation points during fixation, and the pursuit direction during pursuit are significant factors in attention allocation. Results of the study are applied to the design of an interactive computer game. Real-time eye movement information is taken as one of inputs for the game. The utility of such eye information for controlling game difficulty is shown.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Entertainment; eye movements; eye tracking; HCI; video games; visual attention", } @Article{Komogortsev:2008:PRT, author = "Oleg V. Komogortsev and Javed I. 
Khan", title = "Predictive real-time perceptual compression based on eye-gaze-position analysis", journal = j-TOMCCAP, volume = "4", number = "3", pages = "23:1--23:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386109.1386116", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:12 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article designs a real-time perceptual compression system (RTPCS) based on eye-gaze-position analysis. Our results indicate that the eye-gaze-position containment metric provides more efficient and effective evaluation of an RTPCS than the eye fixation containment. The presented RTPCS is designed for a network communication scenario with a feedback loop delay. The proposed RTPCS uses human visual system properties to compensate for the delay and to provide high ratios of multimedia compression.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "human visual system; Real-time multimedia compression", } @Article{Cesar:2008:ISI, author = "Pablo Cesar and Dick C. A. 
Bulterman and Luiz Fernando Gomes Soares", title = "Introduction to special issue: {Human-centered} television --- directions in interactive digital television research", journal = j-TOMCCAP, volume = "4", number = "4", pages = "24:1--24:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412196.1412197", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:32 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The research area of interactive digital TV is in the midst of a significant revival. Unlike the first generation of digital TV, which focused on producer concerns that effectively limited (re)distribution, the current generation of research is closely linked to the role of the user in selecting, producing, and distributing content. The research field of interactive digital television is being transformed into a study of human-centered television. Our guest editorial reviews relevant aspects of this transformation in the three main stages of the content lifecycle: content production, content delivery, and content consumption. While past research on content production tools focused on full-fledged authoring tools for professional editors, current research studies lightweight, often informal end-user authoring systems. In terms of content delivery, user-oriented infrastructures such as peer-to-peer are being seen as alternatives to more traditional broadcast solutions. Moreover, end-user interaction is no longer limited to content selection, but now facilitates nonlinear participatory television productions. Finally, user-to-user communication technologies have allowed television to become a central component of an interconnected social experience. 
The background context given in this article provides a framework for appreciating the significance of four detailed contributions that highlight important directions in transforming interactive television research.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Interactive television; shared experiences; standards; survey", } @Article{Ursu:2008:ITN, author = "Marian F. Ursu and Maureen Thomas and Ian Kegel and Doug Williams and Mika Tuomola and Inger Lindstedt and Terence Wright and Andra Leurdijk and Vilmos Zsombori and Julia Sussner and Ulf Myrestam and Nina Hall", title = "Interactive {TV} narratives: {Opportunities}, progress, and challenges", journal = j-TOMCCAP, volume = "4", number = "4", pages = "25:1--25:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412196.1412198", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:32 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article is motivated by the question whether television should do more than simply offer interactive services alongside (and separately from) traditional linear programs, in the context of its dominance being seriously challenged and threatened by interactive forms of screen media entertainment. It suggests: yes. Interactive {\em narrativity}, that is, the ability to interact with (and influence) stories whilst they are being told, represents one clear development path for interactive television. The capabilities of computing technology are ripe for exploring this new form of storytelling, from creation to commercial distribution. 
The article starts by looking at the relationship between narrativity and interactivity in the current context of screen media, and identifies clear signs of interest from certain European public broadcasters in interactive TV narratives. It then presents in detail four recent experimental interactive TV productions in the genres of drama, news, and documentary, developed in collaboration with public broadcasters, which illustrate the potential and richness of this new form of storytelling, but also highlight new technological capabilities necessary for such productions. A number of essential technological requirements are then discussed in more detail in the final part. The article suggests that the ShapeShifting Media Technology, employed in the implementation of the four productions, has made significant advances both at the technological and the creative ends in supporting the development of interactive TV narrativity, but, however, that further developments are required before being able to answer questions such as ``Would end users want such a form of screen media entertainment?'' and ``Would it be effective for both end users and producers?''", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "computational narrativity; digital storytelling; entertainment; Interactive; media; narrativity; nonlinear; screen media; shapeshifting; television", } @Article{Cheng:2008:GIP, author = "Bin Cheng and Lex Stein and Hai Jin and Xiaofei Liao and Zheng Zhang", title = "{GridCast}: {Improving} peer sharing for {P2P VoD}", journal = j-TOMCCAP, volume = "4", number = "4", pages = "26:1--26:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412196.1412199", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:32 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video-on-Demand (VoD) is a compelling application, but costly. VoD is costly due to the load it places on video source servers. Many have proposed using peer-to-peer (P2P) techniques to shift load from servers to peers. Yet, nobody has implemented and deployed a system to openly and systematically evaluate how these techniques work.\par This article describes the design, implementation and evaluation of GridCast, a real deployed P2P VoD system. GridCast has been live on CERNET since May of 2006. It provides seek, pause, and play operations, and employs peer sharing to improve system scalability. In peak months, GridCast has served videos to 23,000 unique users. From the first deployment, we have gathered information to understand the system and evaluate how to further improve peer sharing through caching and replication.\par We first show that GridCast with single video caching (SVC) can decrease load on source servers by an average of 22\% from a client-server architecture. We analyze the net effect on system resources and determine that peer upload is largely idle. 
This leads us to changing the caching algorithm to cache multiple videos (MVC). MVC decreases source load by an average of 51\% over the client-server. The improvement is greater as user load increases. This bodes well for peer-assistance at larger scales.\par A detailed analysis of MVC shows that departure misses become a major issue in a P2P VoD system with caching optimization. Motivated by this observation, we examine how to use replication to eliminate departure misses and further reduce server load. A framework for lazy replication is presented and evaluated in this article. In this framework, two predictors are plugged in to create the working replication algorithm. With these two simple predictors, lazy replication can decrease server load by 15\% from MVC with only a minor increase in network traffic.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "caching; peer-to-peer; replication; Video-on-demand", } @Article{Metcalf:2008:EPL, author = "Crysta Metcalf and Gunnar Harboe and Joe Tullio and Noel Massey and Guy Romano and Elaine M. Huang and Frank Bentley", title = "Examining presence and lightweight messaging in a social television experience", journal = j-TOMCCAP, volume = "4", number = "4", pages = "27:1--27:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412196.1412200", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:32 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We report on a field evaluation of a prototype social television system (Social TV) that incorporates lightweight messaging as well as ambient awareness of user presence on the system. 
This evaluation was conducted over a two-week period and involved the participation of ten households. Participants appreciated the ability to see their buddies' presence on the system, the ability to see or suggest the programs they were currently watching, and the ability to send short messages to one another. The presence facilities available in Social TV also allowed participants to learn more about one another's TV viewing habits and preferences, and fostered a sense of connectedness between them. However, they also felt constrained by the limitations of the communication options available to them and demanded free-form text or voice chat to be able to fully express themselves.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "ambient displays; awareness displays; computer-mediated communication; Social television", } @Article{Cattelan:2008:WCP, author = "Renan G. Cattelan and Cesar Teixeira and Rudinei Goularte and Maria Da Gra{\c{c}}a C. Pimentel", title = "Watch-and-comment as a paradigm toward ubiquitous interactive video editing", journal = j-TOMCCAP, volume = "4", number = "4", pages = "28:1--28:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412196.1412201", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:32 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The literature reports research efforts allowing the editing of interactive TV multimedia documents by end-users. In this article we propose complementary contributions relative to end-user generated interactive video, video tagging, and collaboration. 
In earlier work we proposed the {\em watch-and-comment\/} (WaC) paradigm as the seamless capture of an individual's comments so that corresponding annotated interactive videos be automatically generated. As a proof of concept, we implemented a prototype application, the WaCTool, that supports the capture of digital ink and voice comments over individual frames and segments of the video, producing a declarative document that specifies both: different media stream structure and synchronization.\par In this article, we extend the WaC paradigm in two ways. First, user-video interactions are associated with edit commands and digital ink operations. Second, focusing on collaboration and distribution issues, we employ annotations as simple containers for context information by using them as tags in order to organize, store and distribute information in a P2P-based multimedia capture platform. We highlight the design principles of the watch-and-comment paradigm, and demonstrate related results including the current version of the WaCTool and its architecture. We also illustrate how an interactive video produced by the WaCTool can be rendered in an interactive video environment, the Ginga-NCL player, and include results from a preliminary evaluation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Annotation; Ginga-NCL; interactive digital video; P2P collaboration", } @Article{Bailey:2008:SSA, author = "Brian P. 
Bailey and Nicu Sebe and Alan Hanjalic", title = "Special section from the {ACM Multimedia Conference 2007}", journal = j-TOMCCAP, volume = "5", number = "1", pages = "1:1--1:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404881", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gleicher:2008:RCI, author = "Michael L. Gleicher and Feng Liu", title = "Re-cinematography: {Improving} the camerawork of casual video", journal = j-TOMCCAP, volume = "5", number = "1", pages = "2:1--2:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404882", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents an approach to postprocessing casually captured videos to improve apparent camera movement. {\em Re-cinematography\/} transforms each frame of a video such that the video better follows cinematic conventions. The approach breaks a video into shorter segments. Segments of the source video where there is no intentional camera movement are made to appear as if the camera is completely static. For segments with camera motions, camera paths are keyframed automatically and interpolated with matrix logarithms to give velocity-profiled movements that appear intentional and directed. Closeups are inserted to provide compositional variety in otherwise uniform segments. 
The approach automatically balances the tradeoff between motion smoothness and distortion to the original imagery. Results from our prototype show improvements to poor quality home videos.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "casual video; cinematography; Image stabilization", } @Article{Qi:2008:CMV, author = "Guo-Jun Qi and Xian-Sheng Hua and Yong Rui and Jinhui Tang and Tao Mei and Meng Wang and Hong-Jiang Zhang", title = "Correlative multilabel video annotation with temporal kernels", journal = j-TOMCCAP, volume = "5", number = "1", pages = "3:1--3:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404883", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Automatic video annotation is an important ingredient for semantic-level video browsing, search and navigation. Much attention has been paid to this topic in recent years. These researches have evolved through two paradigms. In the first paradigm, each concept is individually annotated by a pre-trained binary classifier. However, this method ignores the rich information between the video concepts and only achieves limited success. Evolved from the first paradigm, the methods in the second paradigm add an extra step on the top of the first individual classifiers to fuse the multiple detections of the concepts. However, the performance of these methods can be degraded by the error propagation incurred in the first step to the second fusion one. In this article, another paradigm of the video annotation method is proposed to address these problems. 
It simultaneously annotates the concepts as well as model correlations between them in one step by the proposed {\em Correlative Multilabel\/} (CML) method, which benefits from the compensation of complementary information between different labels. Furthermore, since the video clips are composed by temporally ordered frame sequences, we extend the proposed method to exploit the rich temporal information in the videos. Specifically, a temporal-kernel is incorporated into the CML method based on the discriminative information between {\em Hidden Markov Models\/} (HMMs) that are learned from the videos. We compare the performance between the proposed approach and the state-of-the-art approaches in the first and second paradigms on the widely used TRECVID data set. As to be shown, superior performance of the proposed method is gained.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "concept correlation; multilabeling; temporal kernel; Video annotation", } @Article{Chen:2008:DDN, author = "Yinpeng Chen and Weiwei Xu and Hari Sundaram and Thanassis Rikakis and Sheng-Min Liu", title = "A dynamic decision network framework for online media adaptation in stroke rehabilitation", journal = j-TOMCCAP, volume = "5", number = "1", pages = "4:1--4:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404884", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we present a media adaptation framework for an immersive biofeedback system for stroke patient rehabilitation. 
In our biofeedback system, media adaptation refers to changes in audio/visual feedback as well as changes in physical environment. Effective media adaptation frameworks help patients recover generative plans for arm movement with potential for significantly shortened therapeutic time. The media adaptation problem has significant challenges --- (a) high dimensionality of adaptation parameter space; (b) variability in the patient performance across and within sessions; (c) the actual rehabilitation plan is typically a non-first-order Markov process, making the learning task hard.\par Our key insight is to understand media adaptation as a real-time feedback control problem. We use a mixture-of-experts based Dynamic Decision Network (DDN) for online media adaptation. We train DDN mixtures per patient, per session. The mixture models address two basic questions --- (a) given a specific adaptation suggested by the domain experts, predict the patient performance, and (b) given the expected performance, determine the optimal adaptation decision. The questions are answered through an optimality criterion based search on DDN models trained in previous sessions. We have also developed new validation metrics and have very good results for both questions on actual stroke rehabilitation data.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Biofeedback; dynamic decision network; media adaptation; mixture of experts", } @Article{Thouin:2008:EAV, author = "Frederic Thouin and Mark Coates", title = "Equipment allocation in video-on-demand network deployments", journal = j-TOMCCAP, volume = "5", number = "1", pages = "5:1--5:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404885", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video-on-Demand (VoD) services are very user-friendly, but also complex and resource demanding. Deployments involve careful design of many mechanisms where content attributes and usage models should be taken into account. We define, and propose a methodology to solve, the {\em VoD Equipment Allocation Problem\/} of determining the number and type of streaming servers with directly attached storage (VoD servers) to install at each potential location in a metropolitan area network topology such that deployment costs are minimized. We develop a cost model for VoD deployments based on streaming, storage and transport costs and train a parametric function that maps the amount of available storage to a worst-case hit ratio. We observe the impact of having to determine the amount of storage and streaming cojointly, and determine the minimum demand required to deploy replicas as well as the average hit ratio at each location. 
We observe that common video-on-demand server configurations lead to the installation of excessive storage, because a relatively high hit-ratio can be achieved with small amounts of storage so streaming requirements dominate.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "equipment allocation; optimization; resource allocation; Video-on-demand", } @Article{Kolan:2008:NLV, author = "Prakash Kolan and Ram Dantu and Jo{\~a}o W. Cangussu", title = "Nuisance level of a voice call", journal = j-TOMCCAP, volume = "5", number = "1", pages = "6:1--6:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404886", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In our everyday life, we communicate with many people such as family, friends, neighbors, and colleagues. We communicate with them using different communication media such as email, telephone calls, and face-to-face interactions. While email is not real-time and face-to-face communications require geographic proximity, voice and video communications are preferred over other modes of communication. However, real-time voice/video calls may create nuisance to the receiver. In this article, we describe a mathematical model for computing nuisance level of incoming voice/video calls. We computed the closeness and nuisance level using the calling patterns between the caller and the callee. To validate the nuisance model, we collected cell phone call records of real-life people at our university and computed the nuisance value for all voice calls. 
We validated the nuisance levels using the feedback from those real-life people. Such a nuisance model is useful for predicting unwanted voice and video sessions in an IP communication network.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "behavior; Multimedia communications; nuisance; presence; security; tolerance; unwantedness", } @Article{Zheng:2008:CVP, author = "Qing-Fang Zheng and Wen Gao", title = "Constructing visual phrases for effective and efficient object-based image retrieval", journal = j-TOMCCAP, volume = "5", number = "1", pages = "7:1--7:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404887", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The explosion of multimedia data necessitates effective and efficient ways for us to get access to our desired ones. In this article, we draw an analogy between image retrieval and text retrieval and propose a visual phrase-based approach to retrieve images containing desired objects (object-based image retrieval). The visual phrase is defined as a pair of frequently co-occurred adjacent local image patches and is constructed using data mining. We design methods on how to construct visual phrase and how to index/search images based on visual phrase. We demonstrate experiments to show our visual phrase-based approach can be very efficient and more effective than current visual word-based approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Content-based image retrieval; inverted index; local image descriptor; object-based image retrieval; SIFT; visual phrase", } @Article{Gill:2008:SDM, author = "Phillipa Gill and Liqi Shi and Anirban Mahanti and Zongpeng Li and Derek L. Eager", title = "Scalable on-demand media streaming for heterogeneous clients", journal = j-TOMCCAP, volume = "5", number = "1", pages = "8:1--8:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404888", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Periodic broadcast protocols enable efficient streaming of highly popular media files to large numbers of concurrent clients. Most previous periodic broadcast protocols, however, assume that all clients can receive at the same rate, and also assume that reception bandwidth is not time-varying. In this article, we first develop a new periodic broadcast protocol, Optimized Heterogeneous Periodic Broadcast (OHPB), that can be optimized for a given population of clients with heterogeneous reception bandwidths and quality-of-service requirements. The OHPB protocol utilizes an optimized segment size progression determined by solving a linear optimization model that takes as input the client population characteristics and an objective function such as mean client startup delay. We then develop a generalization of the OHPB linear optimization model that allows optimal server bandwidth allocation among multiple concurrent OHPB broadcasts, wherein each media file and its clients may have different characteristics. 
Finally, we propose complementary client protocols employing work-ahead buffering of data during playback, so as to enable more uniform playback quality when the reception bandwidth is time-varying.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "linear programming; periodic broadcasts; quality-of-service; Scalable streaming", } @Article{Jung:2008:SSL, author = "Dawoon Jung and Jaegeuk Kim and Jin-Soo Kim and Joonwon Lee", title = "{ScaleFFS}: a scalable log-structured flash file system for mobile multimedia systems", journal = j-TOMCCAP, volume = "5", number = "1", pages = "9:1--9:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1404880.1404889", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:51:49 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "NAND flash memory has become one of the most popular storage media for mobile multimedia systems. A key issue in designing storage systems for mobile multimedia systems is handling large-capacity storage media and numerous large files with limited resources such as memory. However, existing flash file systems, including JFFS2 and YAFFS in particular, exhibit many limitations in addressing the storage capacity of mobile multimedia systems.\par In this article, we design and implement a scalable flash file system, called ScaleFFS, for mobile multimedia systems. ScaleFFS is designed to require only a small fixed amount of memory space and to provide fast mount time, even if the file system size grows to more than tens of gigabytes. 
The measurement results show that ScaleFFS can be instantly mounted regardless of the file system size, while achieving the same write bandwidth and up to 22\% higher read bandwidth compared to JFFS2.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "File system; flash memory; NAND; storage system", } @Article{Moncrieff:2008:DPA, author = "Simon Moncrieff and Svetha Venkatesh and Geoff West", title = "Dynamic privacy assessment in a smart house environment using multimodal sensing", journal = j-TOMCCAP, volume = "5", number = "2", pages = "10:1--10:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413863", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Surveillance applications in private environments such as smart houses require a privacy management policy if such systems are to be accepted by the occupants of the environment. This is due to the invasive nature of surveillance, and the private nature of the home. In this article, we propose a framework for dynamically altering the privacy policy applied to the monitoring of a smart house based on the situation within the environment. Initially the situation, or context, within the environment is determined; we identify several factors for determining environmental context, and propose methods to quantify the context using audio and binary sensor data. The context is then mapped to an appropriate privacy policy, which is implemented by applying data hiding techniques to control access to data gathered from various information sources. 
The significance of this work lies in the examination of privacy issues related to assisted-living smart house environments. A single privacy policy in such applications would be either too restrictive for an observer, for example, a carer, or too invasive for the occupants. We address this by proposing a dynamic method, with the aim of decreasing the invasiveness of the technology, while retaining the purpose of the system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Assisted living; audio; context aware; privacy; surveillance and monitoring", } @Article{Adams:2008:SUS, author = "Brett Adams and Dinh Phung and Svetha Venkatesh", title = "Sensing and using social context", journal = j-TOMCCAP, volume = "5", number = "2", pages = "11:1--11:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413864", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present online algorithms to extract social context: Social spheres are labeled locations of significance, represented as convex hulls extracted from GPS traces. Colocation is determined from Bluetooth and GPS to extract social rhythms, patterns in time, duration, place, and people corresponding to real-world activities. Social ties are formulated from proximity and shared spheres and rhythms. Quantitative evaluation is performed for 10+ million samples over 45 man-months. 
Applications are presented with assessment of perceived utility: {\em Socio-Graph}, a video and photo browser with filters for social metadata, and {\em Jive}, a blog browser that uses rhythms to discover similarity between entries automatically.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Multimedia browsing; social context", } @Article{Mohanty:2008:IWB, author = "Saraju P. Mohanty and Bharat K. Bhargava", title = "Invisible watermarking based on creation and robust insertion-extraction of image adaptive watermarks", journal = j-TOMCCAP, volume = "5", number = "2", pages = "12:1--12:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413865", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel invisible robust watermarking scheme for embedding and extracting a digital watermark in an image. The novelty lies in determining a perceptually important subimage in the host image. Invisible insertion of the watermark is performed in the most significant region of the host image such that tampering of that portion with an intention to remove or destroy will degrade the esthetic quality and value of the image. One feature of the algorithm is that this subimage is used as a region of interest for the watermarking process and eliminates the chance of watermark removal. Another feature of the algorithm is the creation of a compound watermark using the input user watermark (logo) and attributes of the host image. 
This facilitates the homogeneous fusion of a watermark with the cover image, preserves the quality of the host image, and allows robust insertion-extraction. Watermark creation consists of two distinct phases. During the first phase, a statistical image is synthesized from a perceptually important subimage of the image. A compound watermark is created by embedding a watermark (logo) into the statistical synthetic image by using a visible watermarking technique. This compound watermark is invisibly embedded into the important block of the host image. The authentication process involves extraction of the perceptive logo as well as statistical testing for two-layer evidence. Results of the experimentation using standard benchmarks demonstrate the robustness and efficacy of the proposed watermarking approach. Ownership proof could be established under various hostile attacks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "content protection; copyright protection; image; invisible watermarking; Watermarking", } @Article{Yiu:2008:ODC, author = "Wai-Pun Ken Yiu and Shueng-Han Gary Chan", title = "Offering data confidentiality for multimedia overlay multicast: {Design} and analysis", journal = j-TOMCCAP, volume = "5", number = "2", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413866", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Application layer multicast (ALM) has been proposed to overcome current limitations in IP multicast for large-group multimedia communication. 
We address offering data confidentiality tailored for ALM. To achieve confidentiality, a node may need to continuously {\em re-encrypt\/} packets before forwarding them downstream. Furthermore, keys have to be changed whenever there is a membership change, leading to {\em rekey\/} processing overhead at the nodes. For a large and dynamic group, these reencryption and rekeying operations incur high processing overhead at the nodes. We propose and analyze a scalable scheme called Secure Overlay Multicast (SOM) which clusters ALM peers so as to localize rekeying within a cluster and to limit re-encryption at cluster boundaries, thereby minimizing the total nodal processing overhead. We describe the operations of SOM and compare its nodal processing overhead with two other basic approaches, namely, host-to-host encryption and whole group encryption. We also present a simplified analytic model for SOM and show that there exists an optimal cluster size to minimize the total nodal processing overhead. By comparing with a recently proposed ALM scheme (DT protocol), SOM achieves a substantial reduction in nodal processing overhead with similar network performance in terms of network stress and delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Key management; multicast security; overlay multicast; performance analysis", } @Article{Nakayama:2008:ECR, author = "Minoru Nakayama and Yosiyuki Takahasi", title = "Estimation of certainty for responses to multiple-choice questionnaires using eye movements", journal = j-TOMCCAP, volume = "5", number = "2", pages = "14:1--14:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413867", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "To examine the feasibility of estimating the degree of strength of belief (SOB) of responses using eye movements, the scan paths of eye movements were analyzed while subjects reviewed their own responses to multiple choice tasks. All fixation points of eye movements were classified into visual areas, or cells, which corresponded with the positions of answers. Two estimation procedures are proposed using eye-movement data. The first one is identifying SOB using scan-path transitions. By comparing subject's reports of high and low SOB and eye-movement estimations, a significant correct rate of discrimination of SOB was observed. When the threshold of discrimination was controlled, a high rate of correct responses was obtained if it was set at a low level.\par The second procedure is conducting SOB discrimination using support vector machines (SVM) trained with features of fixations. Subject's gazing features were analyzed while they reviewed their own responses. A discrimination model for SOB was trained with several combinations of features to see whether performance of a significant level could be obtained. 
As a result, a trained model with 3 features (which consist of interval time, vertical difference, and length between fixations) can provide significant discrimination performance for SOB.\par These results provide evidence that strength of belief can be estimated using eye movements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "certainty; Eye-movements; scan-path analysis; support vector machines", } @Article{Shipman:2008:AVG, author = "Frank Shipman and Andreas Girgensohn and Lynn Wilcox", title = "Authoring, viewing, and generating hypervideo: an overview of {Hyper-Hitchcock}", journal = j-TOMCCAP, volume = "5", number = "2", pages = "15:1--15:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413868", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hyper-Hitchcock consists of three components for creating and viewing a form of interactive video called detail-on-demand video: a hypervideo editor, a hypervideo player, and algorithms for automatically generating hypervideo summaries. Detail-on-demand video is a form of hypervideo that supports one hyperlink at a time for navigating between video sequences. The Hyper-Hitchcock editor enables authoring of detail-on-demand video without programming and uses video processing to aid in the authoring process. The Hyper-Hitchcock player uses labels and keyframes to support navigation through and back hyperlinks. 
Hyper-Hitchcock includes techniques for automatically generating hypervideo summaries of one or more videos that take the form of multiple linear summaries of different lengths with links from the shorter to the longer summaries. User studies on authoring and viewing provided insight into the various roles of links in hypervideo and found that player interface design greatly affects people's understanding of hypervideo structure and the video they access.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Hypervideo; link generation; video editing; video summarization", } @Article{He:2008:EED, author = "Wenbo He and Klara Nahrstedt and Xue Liu", title = "End-to-end delay control of multimedia applications over multihop wireless links", journal = j-TOMCCAP, volume = "5", number = "2", pages = "16:1--16:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413869", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The proliferation of multimedia applications over mobile, resource-constrained wireless networks has raised the need for techniques that adapt these applications both to clients' Quality of Service (QoS) requirements and to network resource constraints. This article investigates the upper-layer adaptation mechanisms to achieve end-to-end delay control for multimedia applications. The proposed adaptation approach spans application layer, middleware layer and network layer. 
In application layer, the requirement adaptor dynamically changes the requirement levels according to end-to-end delay measurement and acceptable QoS requirements for the end-users. In middleware layer, the priority adaptor is used to dynamically adjust the service classes for applications using feedback control theory. In network layer, the service differentiation scheduler assigns different network resources (e.g., bandwidth) to different service classes. With the coordination of these three layers, our approach can adaptively assign resources to multimedia applications. To evaluate the impact of our adaptation scheme, we built a real IEEE 802.11 ad hoc network testbed. The test-bed experiments show that the proposed upper-layer adaptation for end-to-end delay control successfully adjusts multimedia applications to meet delay requirements in many scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "End-to-end delay QoS; wireless ad hoc networks", } @Article{Pan:2008:CBM, author = "Leon Pan and Chang N. 
Zhang", title = "A criterion-based multilayer access control approach for multimedia applications and the implementation considerations", journal = j-TOMCCAP, volume = "5", number = "2", pages = "17:1--17:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1413862.1413870", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:17 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, a novel criterion-based multilayer access control (CBMAC) approach is presented to enhance existing access control models such as Role-Based, Mandatory, and Discretionary Access Control models to support multilayer (multilevel) access control. The proposed approach is based on a set of predefined security criteria which are extracted from authorization rules. The security attributes of objects and users are specified by security criterion expressions (serving as locks) and the elements (serving as keys) of security criterion subsets respectively. An object embedded with a number of security criterion expressions becomes a secure object while a user associated with a security criterion subset is called a secure user. The multilayer access control is achieved by evaluating the embedded security criterion expressions (actuating locks) by the elements (keys) in a user's security criterion subset. The paper also provides the details of integrating the proposed approach with existing access control models and presents the implementation considerations of Criterion-Based Role-Based Multilayer Access Control, the integration of CBMAC and Role-Based Access Control.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Multilayer access control; secure object; secure permission; secure user; security criterion", } @Article{Candan:2009:ISS, author = "K. Sel{\c{c}}uk Candan and Alberto {Del Bimbo} and Carsten Griwodz and Alejandro Jaimes", title = "Introduction to the special section for the best papers of {ACM Multimedia 2008}", journal = j-TOMCCAP, volume = "5", number = "3", pages = "18:1--18:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556135", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cesar:2009:FTE, author = "Pablo Cesar and Dick C. A. Bulterman and Jack Jansen and David Geerts and Hendrik Knoche and William Seager", title = "Fragment, tag, enrich, and send: {Enhancing} social sharing of video", journal = j-TOMCCAP, volume = "5", number = "3", pages = "19:1--19:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The migration of media consumption to personal computers retains distributed social viewing, but only via nonsocial, strictly personal interfaces. 
This article presents an architecture, and implementation for media sharing that allows for enhanced social interactions among users. Using a mixed-device model, our work allows targeted, personalized enrichment of content. All recipients see common content, while differentiated content is delivered to individuals via their personal secondary screens. We describe the goals, architecture, and implementation of our system in this article. In order to validate our results, we also present results from two user studies involving disjoint sets of test participants.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Asynchronous media sharing; differentiated content enrichment; secondary screens", } @Article{Knoche:2009:BPS, author = "H. Knoche and M. A. Sasse", title = "The big picture on small screens delivering acceptable video quality in mobile {TV}", journal = j-TOMCCAP, volume = "5", number = "3", pages = "20:1--20:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mobile TV viewers can change the viewing distance and (on some devices) scale the picture to their preferred viewing ratio, trading off size for angular resolution. We investigated optimal trade-offs between size and resolution through a series of studies. Participants selected their preferred size and rated the acceptability of the visual experience on a 200ppi device at a 4:3 aspect ratio. They preferred viewing ratios similar to living room TV setups regardless of the much lower resolution: at a minimum 14 pixels per degree. 
While traveling on trains people required videos with a height larger than 35mm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Mobile multimedia consumption; resolution; size; trade-off", } @Article{Mondet:2009:CPP, author = "Sebastien Mondet and Wei Cheng and Geraldine Morin and Romulus Grigoras and Frederic Boudon and Wei Tsang Ooi", title = "Compact and progressive plant models for streaming in networked virtual environments", journal = j-TOMCCAP, volume = "5", number = "3", pages = "21:1--21:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556138", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Just as in the real world, plants are important objects in virtual worlds for creating pleasant and realistic environments, especially those involving natural scenes. As such, much effort has been made in realistic modeling of plants. As the trend moves towards networked and distributed virtual environments, however, the current models are inadequate as they are not designed for progressive transmissions. In this article, we fill in this gap by proposing a progressive representation for plants based on generalized cylinders. We model the shape and thickness of branches in a plant as B{\'e}zier curves, group the curves according to the similarity, and differentially code the curves to represent the plant in a compact and progressive manner. To facilitate the transmission of the plants, we quantify the visual contribution of each branch and use this weight in packet scheduling. 
We show the efficiency of our representations and the effectiveness of our packet scheduler through experiments over a wide area network.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "multiresolution; networked virtual environment; plant models; progressive coding; progressive transmission; Streaming", } @Article{Wei:2009:CCM, author = "Yong Wei and Suchendra M. Bhandarkar and Kang Li", title = "Client-centered multimedia content adaptation", journal = j-TOMCCAP, volume = "5", number = "3", pages = "22:1--22:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556139", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The design and implementation of a client-centered multimedia content adaptation system suitable for a mobile environment comprising resource-constrained handheld devices or clients is described. The primary contributions of this work are: (1) the overall architecture of the client-centered content adaptation system, (2) a data-driven multi-level Hidden Markov model (HMM)-based approach to perform both video segmentation and video indexing in a single pass, and (3) the formulation and implementation of a Multiple-choice Multidimensional Knapsack Problem (MMKP)-based video personalization strategy. In order to segment and index video data, a video stream is modeled at both the semantic unit level and video program level. These models are learned entirely from training data and no domain-dependent knowledge about the structure of video programs is used. 
This makes the system capable of handling various kinds of videos without having to manually redefine the program model. The proposed MMKP-based personalization strategy is shown to include more relevant video content in response to the client's request than the existing 0/1 knapsack problem and fractional knapsack problem-based strategies, and is capable of satisfying multiple client-side constraints simultaneously. Experimental results on CNN news videos and Major League Soccer (MLS) videos are presented and analyzed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "hidden Markov models; multiple choice multidimensional knapsack problem; video indexing; Video personalization", } @Article{Sivaram:2009:DMS, author = "G. S. V. S. Sivaram and Mohan S. Kankanhalli and K. R. Ramakrishnan", title = "Design of multimedia surveillance systems", journal = j-TOMCCAP, volume = "5", number = "3", pages = "23:1--23:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article addresses the problem of how to select the optimal combination of sensors and how to determine their optimal placement in a surveillance region in order to meet the given performance requirements at a minimal cost for a multimedia surveillance system. We propose to solve this problem by obtaining a performance vector, with its elements representing the performances of subtasks, for a given input combination of sensors and their placement. 
Then we show that the optimal sensor selection problem can be converted into the form of Integer Linear Programming problem (ILP) by using a linear model for computing the optimal performance vector corresponding to a sensor combination. Optimal performance vector corresponding to a sensor combination refers to the performance vector corresponding to the optimal placement of a sensor combination. To demonstrate the utility of our technique, we design and build a surveillance system consisting of PTZ (Pan-Tilt-Zoom) cameras and active motion sensors for capturing faces. Finally, we show experimentally that optimal placement of sensors based on the design maximizes the system performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Performance vector; sensor selection and placement", } @Article{Liu:2009:SSE, author = "Xiaotao Liu and Mark Corner and Prashant Shenoy", title = "{\em {SEVA\/}}: {Sensor-enhanced} video annotation", journal = j-TOMCCAP, volume = "5", number = "3", pages = "24:1--24:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556141", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we study how a sensor-rich world can be exploited by digital recording devices such as cameras and camcorders to improve a user's ability to search through a large repository of image and video files. We design and implement a digital recording system that records identities and locations of objects (as advertised by their sensors) along with visual images (as recorded by a camera). 
The process, which we refer to as {\em Sensor-Enhanced Video Annotation (SEVA)}, combines a series of correlation, interpolation, and extrapolation techniques. It produces a tagged stream that later can be used to efficiently search for videos or frames containing particular objects or people. We present detailed experiments with a prototype of our system using both stationary and mobile objects as well as GPS and ultrasound. Our experiments show that: (i) SEVA has zero error rates for static objects, except very close to the boundary of the viewable area; (ii) for moving objects or a moving camera, SEVA only misses objects leaving or entering the viewable area by 1--2 frames; (iii) SEVA can scale to 10 fast-moving objects using current sensor technology; and (iv) SEVA runs online using relatively inexpensive hardware.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "context-based retrieval; location-based services; sensor-enhanced; Video annotation", } @Article{Wang:2009:MLS, author = "Bing Wang and Wei Wei and Zheng Guo and Don Towsley", title = "Multipath live streaming via {TCP}: {Scheme}, performance and benefits", journal = j-TOMCCAP, volume = "5", number = "3", pages = "25:1--25:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556142", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Motivated by the wide use of TCP for multimedia streaming in practice and the increasing availability of multipath between end hosts, we study multipath live streaming via TCP in this article. 
We first design a simple and practical TCP-based multipath streaming scheme, named {\em Dynamic MPath-streaming (DMP-streaming)}, which dynamically distributes packets over multiple paths by {\em implicitly inferring\/} the available bandwidths on these paths. To allow systematic performance study, we develop an analytical model for DMP-streaming and validate the model using extensive {\em ns\/} simulation and Internet experiments. We explore the parameter space of this model and find that DMP-streaming generally provides satisfactory performance when the aggregate achievable TCP throughput is 1.6 times the video bitrate, when allowing a few seconds of startup delay. Last, we comment on the benefits of using multipath versus single path for TCP-based streaming.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "multimedia streaming; Performance modeling", } @Article{Li:2009:PBR, author = "Mingzhe Li and Mark Claypool and Robert Kinicki", title = "Playout buffer and rate optimization for streaming over {IEEE 802.11} wireless networks", journal = j-TOMCCAP, volume = "5", number = "3", pages = "26:1--26:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1556134.1556143", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:52:39 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Most streaming rate selection and buffer optimization algorithms are developed for wired networks and can perform poorly over wireless networks. Wireless MAC layer behavior, such as rate adaptation, retransmissions, and medium sharing, can significantly degrade the effectiveness of current streaming algorithms. 
This article presents the Buffer and Rate Optimization for Streaming (BROS) algorithm to improve streaming performance. BROS uses a bandwidth estimation tool designed specifically for wireless networks and models the relationship between buffer size, streaming data rate, and available bandwidth distribution. BROS optimizes the streaming data rate and initial buffer size, resulting in a high data rate but with few frame losses and buffer underflow events, while still keeping a small initial buffer delay. BROS is implemented in the Emulated Streaming (EmuS) client-server system and evaluated on an IEEE 802.11 wireless testbed with various wireless conditions. The evaluation shows that BROS can effectively optimize the streaming rate and initial buffer size based on wireless network bandwidth conditions, thus achieving better performance than static rate or buffer selection and jitter removal buffers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Multimedia networking; playout buffer; streaming rate; wireless networks", } @Article{Sauer:2009:MDC, author = "Danielle Sauer and Yee-Hong Yang", title = "Music-driven character animation", journal = j-TOMCCAP, volume = "5", number = "4", pages = "27:1--27:??", month = oct, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1596990.1596991", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Music-driven character animation extracts musical features from a song and uses them to create an animation. 
This article presents a system that builds a new animation directly from musical attributes, rather than simply synchronizing it to the music like similar systems. Using a simple script that identifies the movements involved in the performance and their timing, the user can easily control the animation of characters. Another unique feature of the system is its ability to incorporate multiple characters into the same animation, both with synchronized and unsynchronized movements. A system that integrates Celtic dance movements is developed in this article. An evaluation of the results shows that the majority of animations are found to be appealing to viewers and that altering the music can change the attractiveness of the final result.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Character animation; motion synthesis; music analysis; primitive movements", } @Article{Deng:2009:SCA, author = "Robert H. Deng and Yanjiang Yang", title = "A study of content authentication in proxy-enabled multimedia delivery systems: {Model}, techniques, and applications", journal = j-TOMCCAP, volume = "5", number = "4", pages = "28:1--28:??", month = oct, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1596990.1596992", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Compared with the direct server-user approach, the server-proxy-user architecture for multimedia delivery promises significantly improved system scalability. 
The introduction of the intermediary transcoding proxies between content servers and end users in this architecture, however, brings unprecedented challenges to content security. In this article, we present a systematic study on the end-to-end content authentication problem in the server-proxy-user context, where intermediary proxies transcode multimedia content dynamically. We present a formal model for the authentication problem, propose a concrete construction for authenticating generic data modality and formally prove its security. We then apply the generic construction to authenticating specific multimedia formats, for example, JPEG2000 code-streams and MPEG-4 video streams. The prototype implementation shows that our scheme is suitable for practical applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "end-to-end authentication; Multimedia content delivery; security", } @Article{Cha:2009:TVS, author = "Jongeun Cha and Mohamad Eid and Abdulmotaleb {El Saddik}", title = "Touchable {$3$D} video system", journal = j-TOMCCAP, volume = "5", number = "4", pages = "29:1--29:??", month = oct, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1596990.1596993", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multimedia technologies are reaching the limits of providing audio-visual media that viewers consume passively. An important factor, which will ultimately enhance the user's experience in terms of impressiveness and immersion, is interaction. 
Among daily life interactions, haptic interaction plays a prominent role in enhancing the quality of experience of users, and in promoting physical and emotional development. Therefore, a critical step in multimedia research is expected to bring the sense of touch, or haptics, into multimedia systems and applications. This article proposes a touchable 3D video system where viewers can actively touch a video scene through a force-feedback device, and presents the underlying technologies in three functional components: (1) contents generation, (2) contents transmission, and (3) viewing and interaction. First of all, we introduce a depth image-based haptic representation (DIBHR) method that adds haptic and heightmap images, in addition to the traditional depth image-based representation (DIBR), to encode the haptic surface properties of the video media. In this representation, the haptic image contains the stiffness, static friction, and dynamic friction, whereas the heightmap image contains roughness of the video contents. Based on this representation method, we discuss how to generate synthetic and natural (real) video media through a 3D modeling tool and a depth camera, respectively. Next, we introduce a transmission mechanism based on the MPEG-4 framework where new MPEG-4 BIFS nodes are designed to describe the haptic scene. Finally, a haptic rendering algorithm to compute the interaction force between the scene and the viewer is described. As a result, the performance of the haptic rendering algorithm is evaluated in terms of computational time and smooth contact force. It operates marginally within a 1 kHz update rate that is required to provide stable interaction force and provide smoother contact force with the depth image that has high frequency geometrical noise using a median filter.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "haptic rendering algorithm; Haptic surface properties; video representation", } @Article{Benevenuto:2009:VIO, author = "Fabr{\'\i}cio Benevenuto and Tiago Rodrigues and Virgilio Almeida and Jussara Almeida and Keith Ross", title = "Video interactions in online video social networks", journal = j-TOMCCAP, volume = "5", number = "4", pages = "30:1--30:??", month = oct, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1596990.1596994", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article characterizes video-based interactions that emerge from YouTube's video response feature, which allows users to discuss themes and to provide reviews for products or places using much richer media than text. Based on crawled data covering a representative subset of videos and users, we present a characterization from two perspectives: the video response view and the interaction network view. In addition to providing valuable statistical models for various characteristics, our study uncovers typical user behavioral patterns in video-based environments and shows evidence of opportunistic behavior.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "opportunistic behavior; promotion; social media; social networks; video communication; Video interactions; video spam; YouTube", } @Article{Erdmann:2009:IEB, author = "Maike Erdmann and Kotaro Nakayama and Takahiro Hara and Shojiro Nishio", title = "Improving the extraction of bilingual terminology from {Wikipedia}", journal = j-TOMCCAP, volume = "5", number = "4", pages = "31:1--31:??", month = oct, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1596990.1596995", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Research on the automatic construction of bilingual dictionaries has achieved impressive results. Bilingual dictionaries are usually constructed from parallel corpora, but since these corpora are available only for selected text domains and language pairs, the potential of other resources is being explored as well.\par In this article, we want to further pursue the idea of using Wikipedia as a corpus for bilingual terminology extraction. We propose a method that extracts term-translation pairs from different types of Wikipedia link information. After that, an SVM classifier trained on the features of manually labeled training data determines the correctness of unseen term-translation pairs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Bilingual dictionary; link analysis; Wikipedia mining", } @Article{Carlsson:2010:SSL, author = "Niklas Carlsson and Derek L. 
Eager", title = "Server selection in large-scale video-on-demand systems", journal = j-TOMCCAP, volume = "6", number = "1", pages = "1:1--1:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671955", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video on demand, particularly with user-generated content, is emerging as one of the most bandwidth-intensive applications on the Internet. Owing to content control and other issues, some video-on-demand systems attempt to prevent downloading and peer-to-peer content delivery. Instead, such systems rely on server replication, such as via third-party content distribution networks, to support video streaming (or pseudostreaming) to their clients. A major issue with such systems is the cost of the required server resources.\par By synchronizing the video streams for clients that make closely spaced requests for the same video from the same server, server costs (such as for retrieval of the video data from disk) can be amortized over multiple requests. A fundamental trade-off then arises, however, with respect to server selection. Network delivery cost is minimized by selecting the {\em nearest\/} server, while server cost is minimized by directing closely spaced requests for the same video to a {\em common\/} server.\par This article compares classes of server selection policies within the context of a simple system model. 
We conclude that: (i) server selection using dynamic system state information (rather than only proximities and average loads) can yield large improvements in performance, (ii) deferring server selection for a request as late as possible (i.e., until just before streaming is to begin) can yield additional large improvements, and (iii) within the class of policies using dynamic state information and deferred selection, policies using only ``local'' (rather than global) request information are able to achieve most of the potential performance gains.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "content distribution networks; modeling; Performance analysis; server selection; video-on-demand", } @Article{Agarwal:2010:BRW, author = "Parag Agarwal and Balakrishnan Prabhakaran", title = "Blind robust watermarking of {$3$D} motion data", journal = j-TOMCCAP, volume = "6", number = "1", pages = "2:1--2:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671956", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The article addresses the problem of copyright protection for 3D motion-captured data by designing a robust blind watermarking mechanism. The mechanism segments motion capture data and identifies clusters of 3D points per segment. A watermark can be embedded and extracted within these clusters by using a proposed extension of 3D quantization index modulation. The watermarking scheme is blind in nature and the encoded watermarks are shown to be imperceptible, and secure. The resulting hiding capacity has bounds based on cluster size. 
The watermarks are shown to be robust against attacks such as uniform affine transformations (scaling, rotation, and translation), cropping, reordering, and noise addition. The time complexity for watermark embedding and extraction is estimated as O({\em n\/} log {\em n\/}) and O({\em n\/}$^2$ log {\em n\/}), respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "blind; decoding; encoding; spatial; Watermarking", } @Article{Yang:2010:DMD, author = "Bo Yang", title = "{DSI}: a model for distributed multimedia semantic indexing and content integration", journal = j-TOMCCAP, volume = "6", number = "1", pages = "3:1--3:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671957", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Considerable research has been done on the content-based multimedia delivery and access in distributed data repositories. As noted in the literature, there is always a trade-off between multimedia quality and access speed. In addition, the overall performance is greatly determined by the distribution of the multimedia data. In this article, an unsupervised multimedia semantic integration approach for a distributed infrastructure, the Distributed Semantic Indexing (DSI), is presented that addresses both the data quality and search performance. 
With the ability of summarizing content information and guiding data distribution, the proposed approach is distinguished by: (1) logic-based representation and concise abstraction of the semantic contents of multimedia data, which are further integrated to form a general overview of a multimedia data repository --- content signature; (2) application of linguistic relationships to construct a hierarchical metadata based on the content signatures allowing imprecise queries; and (3) achieving the optimal performance in terms of search cost. The fundamental structure of the proposed model is presented. The proposed scheme has been simulated and the simulation results are analyzed and compared against several other approaches that have been advocated in the literature.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "distributed indexing; image retrieval; Semantic representation", } @Article{Nystrom:2010:ECO, author = "Marcus Nystr{\"o}m and Kenneth Holmqvist", title = "Effect of compressed offline foveated video on viewing behavior and subjective quality", journal = j-TOMCCAP, volume = "6", number = "1", pages = "4:1--4:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671958", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Offline foveation is a technique to improve the compression efficiency of digitized video. The general idea behind offline foveation is to blur video regions where no or a small number of previewers look without decreasing the subjective quality for later viewers. 
It relies on the fact that peripheral vision is reduced compared to central vision, and the observation that during free-viewing humans' gaze positions generally coincide when watching video. In this article, we conduct two experiments to assess how offline foveation affects viewing behavior and subjective quality. In the first experiment, 15 subjects free-viewed six video clips before and after offline foveation whereas in the second experiment we had 17 subjects assessing the quality of these videos after one, two, and three consecutive viewings. Eye movements were measured during the experiments. Results showed that, although offline foveation prior to encoding with H.264 yielded data reductions up to 52\% (20\% average) on the tested videos, it had little or no effect on where people looked, their intersubject dispersion, fixation duration, saccade amplitude, or the experienced quality during first-time viewing. However, seeing the videos more than once increased the intersubject dispersion and decreased the subjective quality. In view of these results, we discuss the usage of offline foveated video in practical applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Eye-tracking; foveation; subjective quality; video compression", } @Article{Ivanov:2010:RTH, author = "Yuri V. Ivanov and C. J. 
Bleakley", title = "Real-time {H.264} video encoding in software with fast mode decision and dynamic complexity control", journal = j-TOMCCAP, volume = "6", number = "1", pages = "5:1--5:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671959", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel real-time algorithm for reducing and dynamically controlling the computational complexity of an H.264 video encoder implemented in software. A fast mode decision algorithm, based on a Pareto-optimal macroblock classification scheme, is combined with a dynamic complexity control algorithm that adjusts the MB class decisions such that a constant frame rate is achieved. The average coding efficiency of the proposed algorithm was found to be similar to that of conventional encoding operating at half the frame rate. The proposed algorithm was found to provide lower average bitrate and distortion than static complexity scaling.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "complexity; complexity control; fast mode decision; H.264/AVC; mode decision; rate distortion; real time", } @Article{Hefeeda:2010:ASM, author = "Mohamed Hefeeda and Kianoosh Mokhtarian", title = "Authentication schemes for multimedia streams: {Quantitative} analysis and comparison", journal = j-TOMCCAP, volume = "6", number = "1", pages = "6:1--6:??", month = feb, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671954.1671960", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Mar 16 18:53:23 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid increase in the demand for multimedia services, securing the delivery of multimedia content has become an important issue. Accordingly, the problem of multimedia stream authentication has received considerable attention by previous research and various solutions have been proposed. However, these solutions have not been rigorously analyzed and contrasted to each other, and thus their relative suitability for different streaming environments is not clear. This article presents comprehensive analysis and comparison among different schemes proposed in the literature to authenticate multimedia streams. Authentication schemes for nonscalable and scalable multimedia streams are analyzed. To conduct this analysis, we define five important performance metrics, which are computation cost, communication overhead, receiver buffer size, delay, and tolerance to packet losses. We derive analytic formulas for these metrics for all considered authentication schemes to numerically analyze their performance. In addition, we implement all schemes in a simulator to study and compare their performance in different environments. 
The parameters for the simulator are carefully chosen to mimic realistic settings. We draw several conclusions on the advantages and disadvantages of each scheme. We extend our analysis to authentication techniques for scalable streams. We pay careful attention to the flexibility of scalable streams and analyze its impacts on the authentication schemes. Our analysis and comparison reveal the merits and shortcomings of each scheme, provide guidelines on choosing the most appropriate scheme for a given multimedia streaming application, and could stimulate designing new authentication schemes or improving existing ones. For example, our detailed analysis has led us to design a new authentication scheme that combines the best features of two previous schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "authentication schemes; Multimedia authentication; multimedia security; multimedia streaming; scalable coding; secure streaming", } @Article{Yang:2010:EMP, author = "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and Gregorij Kurillo and Ruzena Bajcsy", title = "Enabling multi-party {$3$D} tele-immersive environments with {{\em ViewCast}}", journal = j-TOMCCAP, volume = "6", number = "2", pages = "7:1--7:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671962.1671963", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Aug 14 17:17:15 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Three-dimensional tele-immersive (3DTI) environments have great potential to promote collaborative work among geographically distributed users. 
However, most existing 3DTI systems only work with two sites due to the huge demand of resources and the lack of a simple yet powerful networking model to handle connectivity, scalability, and quality-of-service (QoS) guarantees.\par In this article, we explore the design space from the angle of multi-stream management to enable multi-party 3DTI communication. Multiple correlated 3D video streams are employed to provide a comprehensive representation of the physical scene in each 3DTI environment, and are rendered together to establish a common cyberspace among all participating 3DTI environments. The existence of multi-stream correlation provides the unique opportunity for new approaches in QoS provisioning. Previous work mostly concentrated on compression and adaptation techniques on the per-stream basis while ignoring the application layer semantics and the coordination required among streams. We propose an innovative and generalized {\em ViewCast\/} model to coordinate the multi-stream content dissemination over an overlay network. ViewCast leverages view semantics in 3D free-viewpoint video systems to fill the gap between high-level user interest and low-level stream management. In ViewCast, only the view information is specified by the user/application, while the underlying control dynamically performs stream differentiation, selection, coordination, and dissemination. We present the details of ViewCast and evaluate it through both simulation and 3DTI sessions among tele-immersive environments residing in different institutes across the Internet2. Our experimental results demonstrate the implementation feasibility and performance enhancement of ViewCast in supporting multi-party 3DTI collaboration.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "3D tele-immersion; application level multicast; distributed multimedia system; multi-stream coordination; networking protocol; QoS adaptation", } @Article{Wu:2010:ELT, author = "Junwen Wu and Mohan M. Trivedi", title = "An eye localization, tracking and blink pattern recognition system: {Algorithm} and evaluation", journal = j-TOMCCAP, volume = "6", number = "2", pages = "8:1--8:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671962.1671964", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Aug 14 17:17:15 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study is to investigate the fundamental problems of, (1) facial feature detection and localization, especially eye features; and (2) eye dynamics, including tracking and blink detection. We first describe our contribution to eye localization. Following that, we discuss a simultaneous eye tracking and blink detection system. Facial feature detection is solved in a general object detection framework and its performance for eye localization is presented. A binary tree representation based on feature dependency partitions the object feature space in a coarse to fine manner. In each compact feature subspace, independent component analysis (ICA) is used to get the independent sources, whose probability density functions (PDFs) are modeled by Gaussian mixtures. When applying this representation for the task of eye detection, a subwindow is used to scan the entire image and each obtained image patch is examined using Bayesian criteria to determine the presence of an eye subject. 
After the eyes are automatically located with binary tree-based probability learning, interactive particle filters are used for simultaneously tracking the eyes and detecting the blinks. The particle filters use classification-based observation models, in which the posterior probabilities are evaluated by logistic regressions in tensor subspaces. Extensive experiments are used to evaluate the performance from two aspects, (1) blink detection rate and the accuracy of blink duration in terms of the frame numbers; (2) eye tracking accuracy. We also present an experimental setup for obtaining the benchmark data in tracking accuracy evaluation. The experimental evaluation demonstrates the capability of this approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Eye blink detection; human computer interface; particle filtering; video processing", } @Article{Jin:2010:DMN, author = "Xing Jin and S.-H. Gary Chan", title = "Detecting malicious nodes in peer-to-peer streaming by peer-based monitoring", journal = j-TOMCCAP, volume = "6", number = "2", pages = "9:1--9:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671962.1671965", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Aug 14 17:17:15 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current peer-to-peer (P2P) streaming systems often assume that nodes cooperate to upload and download data. However, in the open environment of the Internet, this is not necessarily true and there exist malicious nodes in the system. In this article, we study malicious actions of nodes that can be detected through peer-based monitoring. 
We require each node to monitor the data received and to periodically send monitoring messages about its neighbors to some trustworthy nodes. To efficiently store and search messages among multiple trustworthy nodes, we organize trustworthy nodes into a threaded binary tree. Trustworthy nodes also dynamically redistribute monitoring messages among themselves to achieve load balancing. Our simulation results show that this scheme can efficiently detect malicious nodes with high accuracy, and that the dynamic redistribution method can achieve good load balancing among trustworthy nodes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Malicious nodes; peer monitoring; peer-to-peer streaming", } @Article{Chiu:2010:FMH, author = "Chih-Yi Chiu and Hsin-Min Wang and Chu-Song Chen", title = "Fast min-hashing indexing and robust spatio-temporal matching for detecting video copies", journal = j-TOMCCAP, volume = "6", number = "2", pages = "10:1--10:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671962.1671966", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Aug 14 17:17:15 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The increase in the number of video copies, both legal and illegal, has become a major problem in the multimedia and Internet era. In this article, we propose a novel method for detecting various video copies in a video sequence. 
To achieve fast and robust detection, the method fully integrates several components, namely the min-hashing signature to compactly represent a video sequence, a spatio-temporal matching scheme to accurately evaluate video similarity compiled from the spatial and temporal aspects, and some speedup techniques to expedite both min-hashing indexing and spatio-temporal matching. The results of experiments demonstrate that, compared to several baseline methods with different feature descriptors and matching schemes, the proposed method which combines both global and local feature descriptors yields the best performance when encountering a variety of video transformations. The method is very fast, requiring approximately 0.06 seconds to search for copies of a thirty-second video clip in a six-hour video sequence.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Content-based copy detection; histogram pruning; near-duplicate", } @Article{Sarhan:2010:WTP, author = "Nabil J. Sarhan and Mohammad A. Alsmirat and Musab Al-Hadrusi", title = "Waiting-time prediction in scalable on-demand video streaming", journal = j-TOMCCAP, volume = "6", number = "2", pages = "11:1--11:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1671962.1671967", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Sat Aug 14 17:17:15 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Providing video streaming users with expected waiting times enhances their perceived quality-of-service (QoS) and encourages them to wait. In the absence of any waiting-time feedback, users are more likely to defect because of the uncertainty as to when their services will start. 
We analyze waiting-time predictability in scalable video streaming. We propose two prediction schemes and study their effectiveness when applied with various stream merging techniques and scheduling policies. The results demonstrate that the waiting time can be predicted accurately, especially when enhanced cost-based scheduling is applied. The combination of waiting-time prediction and cost-based scheduling leads to outstanding performance benefits.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", keywords = "Scheduling; stream merging; time-of-service guarantees; video streaming; waiting-time prediction", } @Article{Xu:2010:IBP, author = "Changsheng Xu and Eckehard Steinbach and Abdulmotaleb {El Saddik} and Michelle Zhou", title = "Introduction to the best papers of {ACM Multimedia 2009}", journal = j-TOMCCAP, volume = "6", number = "3", pages = "12:1--12:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1830482", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zha:2010:VQS, author = "Zheng-Jun Zha and Linjun Yang and Tao Mei and Meng Wang and Zengfu Wang and Tat-Seng Chua and Xian-Sheng Hua", title = "Visual query suggestion: {Towards} capturing user intent in {Internet} image search", journal = j-TOMCCAP, volume = "6", number = "3", pages = "13:1--13:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jiang:2010:AVA, author = "Wei Jiang and Courtenay Cotton and Shih-Fu Chang and Dan Ellis and Alexander C. Loui", title = "Audio-visual atoms for generic video concept classification", journal = j-TOMCCAP, volume = "6", number = "3", pages = "14:1--14:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823748", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{DeOliveira:2010:LND, author = "Rodrigo {De Oliveira} and Mauro Cherubini and Nuria Oliver", title = "Looking at near-duplicate videos from a human-centric perspective", journal = j-TOMCCAP, volume = "6", number = "3", pages = "15:1--15:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823749", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yin:2010:LEC, author = "Hao Yin and Xuening Liu and Tongyu Zhan and Vyas Sekar and Feng Qiu and Chuang Lin and Hui Zhang and Bo Li", title = "{LiveSky}: {Enhancing} {CDN} with {P2P}", journal = j-TOMCCAP, volume = "6", number = "3", pages = "16:1--16:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Money:2010:EEL, author = "Arthur G. 
Money and Harry Agius", title = "{ELVIS}: {Entertainment-Led VIdeo Summaries}", journal = j-TOMCCAP, volume = "6", number = "3", pages = "17:1--17:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823751", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hoi:2010:SSD, author = "Steven C. H. Hoi and Wei Liu and Shih-Fu Chang", title = "Semi-supervised distance metric learning for collaborative image retrieval and clustering", journal = j-TOMCCAP, volume = "6", number = "3", pages = "18:1--18:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823752", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Maddage:2010:WLA, author = "Namunu C. 
Maddage and Khe Chai Sim and Haizhou Li", title = "Word level automatic alignment of music and lyrics using vocal synthesis", journal = j-TOMCCAP, volume = "6", number = "3", pages = "19:1--19:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823753", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Qudah:2010:EDD, author = "Bashar Qudah and Nabil J. Sarhan", title = "Efficient delivery of on-demand video streams to heterogeneous receivers", journal = j-TOMCCAP, volume = "6", number = "3", pages = "20:1--20:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823754", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gomes:2010:STA, author = "Jo{\~a}o V. P. Gomes and Pedro R. M. In{\'a}cio and Branka Lakic and M{\'a}rio M. Freire and Henrique J. A. Da Silva and Paulo P. 
Monteiro", title = "Source traffic analysis", journal = j-TOMCCAP, volume = "6", number = "3", pages = "21:1--21:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1823755", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Boll:2010:CPA, author = "Susanne Boll and Jiebo Luo and Ramesh Jain and Dong Xu", title = "Call for papers: {ACM Transactions on Multimedia Computing, Communications and Applications} special issue on social media", journal = j-TOMCCAP, volume = "6", number = "3", pages = "22:1--22:??", month = aug, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1823746.1837254", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2010:OOD, author = "Ralf Steinmetz", title = "Obituary to our dear friend {Professor Dr. Nicolas D. 
Georganas, PhD}", journal = j-TOMCCAP, volume = "6", number = "4", pages = "23:1--23:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865107", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Haenselmann:2010:FSI, author = "Thomas Haenselmann", title = "Foreword to the special issue on multimedia sensor fusion", journal = j-TOMCCAP, volume = "6", number = "4", pages = "24:1--24:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865108", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2010:MBA, author = "Xiangyu Wang and Mohan Kankanhalli", title = "{MultiFusion}: a boosting approach for multimedia fusion", journal = j-TOMCCAP, volume = "6", number = "4", pages = "25:1--25:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865109", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chetty:2010:MSF, author = "Girija Chetty and Matthew White", title = "Multimedia sensor fusion for retrieving identity in biometric access control systems", journal = j-TOMCCAP, volume = "6", number = "4", pages = "26:1--26:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865110", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Friedland:2010:DAS, author = "Gerald Friedland and Chuohao Yeo and Hayley Hung", title = "Dialocalization: {Acoustic} speaker diarization and visual localization as joint optimization problem", journal = j-TOMCCAP, volume = "6", number = "4", pages = "27:1--27:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865111", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rahman:2010:SGA, author = "Abu Saleh Md Mahfujur Rahman and M. 
Anwar Hossain and Abdulmotaleb {El Saddik}", title = "Spatial-geometric approach to physical mobile interaction based on accelerometer and {IR} sensory data fusion", journal = j-TOMCCAP, volume = "6", number = "4", pages = "28:1--28:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865112", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2010:EMT, author = "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and Gregorij Kurillo and Ruzena Bajcsy", title = "Enabling multiparty {$3$D} tele-immersive environments with {ViewCast}", journal = j-TOMCCAP, volume = "6", number = "4", pages = "29:1--29:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865113", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Marshall:2010:OCM, author = "Damien Marshall and S{\'e}amus McLoone and Tom{\'a}s Ward", title = "Optimizing consistency by maximizing bandwidth usage in distributed interactive applications", journal = j-TOMCCAP, volume = "6", number = "4", pages = "30:1--30:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865114", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Vu:2010:UOC, author = "Long Vu and Indranil Gupta and Klara Nahrstedt and Jin Liang", title = "Understanding overlay characteristics of a large-scale peer-to-peer {IPTV} system", journal = j-TOMCCAP, volume = "6", number = "4", pages = "31:1--31:??", month = nov, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1865106.1865115", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Tue Nov 23 10:03:16 MST 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Meyer:2011:MRL, author = "Marek Meyer and Christoph Rensing and Ralf Steinmetz", title = "Multigranularity reuse of learning resources", journal = j-TOMCCAP, volume = "7", number = "1", pages = "1:1--1:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870122", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bouyakoub:2011:SBI, author = "Samia Bouyakoub and Abdelkader Belkhir", title = "{SMIL} builder: an incremental authoring tool for {SMIL Documents}", journal = j-TOMCCAP, volume = "7", number = "1", pages = "2:1--2:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hossain:2011:MAQ, author = "M. Anwar Hossain and Pradeep K. 
Atrey and Abdulmotaleb {El Saddik}", title = "Modeling and assessing quality of information in multisensor multimedia monitoring systems", journal = j-TOMCCAP, volume = "7", number = "1", pages = "3:1--3:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870124", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhu:2011:NDK, author = "Jianke Zhu and Steven C. H. Hoi and Michael R. Lyu and Shuicheng Yan", title = "Near-duplicate keyframe retrieval by semi-supervised learning and nonrigid image matching", journal = j-TOMCCAP, volume = "7", number = "1", pages = "4:1--4:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870125", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hsu:2011:FCL, author = "Cheng-Hsin Hsu and Mohamed Hefeeda", title = "A framework for cross-layer optimization of video streaming in wireless networks", journal = j-TOMCCAP, volume = "7", number = "1", pages = "5:1--5:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870126", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chandra:2011:EAS, author = "Surendar Chandra and Xuwen Yu", title = "An empirical analysis of serendipitous media sharing among campus-wide wireless users", journal = j-TOMCCAP, volume = "7", number = "1", pages = "6:1--6:??", month = jan, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1870121.1870127", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:41 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gopinathan:2011:OLM, author = "Ajay Gopinathan and Zongpeng Li", title = "Optimal layered multicast", journal = j-TOMCCAP, volume = "7", number = "2", pages = "7:1--7:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925102", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hsu:2011:USS, author = "Cheng-Hsin Hsu and Mohamed Hefeeda", title = "Using simulcast and scalable video coding to efficiently control channel switching delay in mobile {TV} broadcast networks", journal = j-TOMCCAP, volume = "7", number = "2", pages = "8:1--8:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925103", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jin:2011:KDH, author = "Yohan Jin and Balakrishnan Prabhakaran", title = "Knowledge discovery from {$3$D} human motion streams through semantic dimensional reduction", journal = j-TOMCCAP, volume = "7", number = "2", pages = "9:1--9:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925104", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheng:2011:MPM, author = "Wei Cheng and Wei Tsang Ooi and Sebastien Mondet and Romulus Grigoras and G{\'e}raldine Morin", title = "Modeling progressive mesh streaming: {Does} data dependency matter?", journal = j-TOMCCAP, volume = "7", number = "2", pages = "10:1--10:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925105", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bagchi:2011:FAD, author = "Susmit Bagchi", title = "A fuzzy algorithm for dynamically adaptive multimedia streaming", journal = j-TOMCCAP, volume = "7", number = "2", pages = "11:1--11:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925106", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hsu:2011:SMV, author = "Cheng-Hsin Hsu and Mohamed Hefeeda", title = "Statistical multiplexing of variable-bit-rate videos streamed to mobile devices", journal = j-TOMCCAP, volume = "7", number = "2", pages = "12:1--12:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1925101.1925107", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Wed Mar 16 09:25:42 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2011:EN, author = "Ralf Steinmetz", title = "Editorial notice", journal = j-TOMCCAP, volume = "7", number = "3", pages = "13:1--13:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000487", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Korshunov:2011:VQF, author = "Pavel Korshunov and Wei Tsang Ooi", title = "Video quality for face detection, recognition, and tracking", journal = j-TOMCCAP, volume = "7", number = "3", pages = "14:1--14:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000488", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2011:PCI, author = "Pei-Yu Lin and Jung-San Lee and Chin-Chen Chang", title = "Protecting the content integrity of digital imagery with fidelity preservation", journal = j-TOMCCAP, volume = "7", number = "3", pages = "15:1--15:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000489", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{VanLeuken:2011:SVO, author = "Reinier H. {Van Leuken} and Remco C. Veltkamp", title = "Selecting vantage objects for similarity indexing", journal = j-TOMCCAP, volume = "7", number = "3", pages = "16:1--16:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000490", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Feng:2011:SRI, author = "Wu-Chi Feng and Thanh Dang and John Kassebaum and Tim Bauman", title = "Supporting region-of-interest cropping through constrained compression", journal = j-TOMCCAP, volume = "7", number = "3", pages = "17:1--17:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000491", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2011:DBA, author = "Qingzhong Liu and Andrew H. Sung and Mengyu Qiao", title = "Derivative-based audio steganalysis", journal = j-TOMCCAP, volume = "7", number = "3", pages = "18:1--18:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000492", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2011:GDO, author = "Frederick W. B. Li and Rynson W. H. Lau and Danny Kilis and Lewis W. F. 
Li", title = "Game-on-demand: an online game engine based on geometry streaming", journal = j-TOMCCAP, volume = "7", number = "3", pages = "19:1--19:??", month = aug, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2000486.2000493", ISSN = "1551-6857 (print), 1551-6865 (electronic)", bibdate = "Mon Sep 5 17:00:22 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shirmohammadi:2011:IAM, author = "Shervin Shirmohammadi and Jiebo Luo and Jie Yang and Abdulmotaleb {El Saddik}", title = "Introduction to {ACM Multimedia 2010} best paper candidates", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "20:1--20:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037677", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bhattacharya:2011:HAA, author = "Subhabrata Bhattacharya and Rahul Sukthankar and Mubarak Shah", title = "A holistic approach to aesthetic enhancement of photographs", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "21:1--21:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037678", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tan:2011:URS, author = "Shulong Tan and Jiajun Bu and Chun Chen and Bin Xu and Can Wang and Xiaofei He", title = "Using rich social media information for music recommendation via hypergraph model", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "22:1--22:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037679", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Milani:2011:CAE, author = "Simone Milani and Giancarlo Calvagno", title = "A cognitive approach for effective coding and transmission of {$3$D} video", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "23:1--23:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037680", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hong:2011:VAE, author = "Richang Hong and Meng Wang and Xiao-Tong Yuan and Mengdi Xu and Jianguo Jiang and Shuicheng Yan and Tat-Seng Chua", title = "Video accessibility enhancement for hearing-impaired users", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "24:1--24:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037681", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Boll:2011:ISI, author = "Susanne Boll and Ramesh Jain and Jiebo Luo and Dong Xu", title = "Introduction to special issue on social media", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "25:1--25:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037682", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2011:EOM, author = "Yu-Ching Lin and Yi-Hsuan Yang and Homer H. Chen", title = "Exploiting online music tags for music emotion classification", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "26:1--26:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037683", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rabbath:2011:ACP, author = "Mohamad Rabbath and Philipp Sandhaus and Susanne Boll", title = "Automatic creation of photo books from stories in social media", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "27:1--27:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037684", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2011:RAI, author = "Weiming Hu and Haiqiang Zuo and Ou Wu and Yunfei Chen and Zhongfei Zhang and David Suter", title = "Recognition of adult images, videos, and web page bags", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "28:1--28:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037685", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2011:SSC, author = "Yu-Ru Lin and K. 
Sel{\c{c}}uk Candan and Hari Sundaram and Lexing Xie", title = "{SCENT}: {Scalable} compressed monitoring of evolving multirelational social networks", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "29:1--29:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037686", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sang:2011:BCT, author = "Jitao Sang and Changsheng Xu", title = "Browse by chunks: {Topic} mining and organizing on web-scale social media", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "30:1--30:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037687", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ji:2011:MFL, author = "Rongrong Ji and Yue Gao and Bineng Zhong and Hongxun Yao and Qi Tian", title = "Mining {\tt flickr} landmarks by modeling reconstruction sparsity", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "31:1--31:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037688", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mandel:2011:CTI, author = "Michael I. Mandel and Razvan Pascanu and Douglas Eck and Yoshua Bengio and Luca M. Aiello and Rossano Schifanella and Filippo Menczer", title = "Contextual tag inference", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "32:1--32:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037689", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Biel:2011:VCB, author = "Joan-Isaac Biel and Daniel Gatica-Perez", title = "{VlogSense}: {Conversational} behavior and social attention in {YouTube}", journal = j-TOMCCAP, volume = "7S", number = "1", pages = "33:1--33:??", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2037676.2037690", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Nov 6 06:36:59 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Anonymous:2011:TCO, author = "Anonymous", title = "Table of Contents: Online Supplement Volume {7S}, Number 1", journal = j-TOMCCAP, volume = "7", number = "4", pages = "34:1--34:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043620", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hong:2011:BSE, author = "Richang Hong and Jinhui Tang and Hung-Khoon Tan and Chong-Wah Ngo and Shuicheng Yan and Tat-Seng Chua", title = "Beyond search: Event-driven summarization for {Web} videos", journal = j-TOMCCAP, volume = "7", number = "4", pages = "35:1--35:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043613", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kuo:2011:TPQ, author = "Wen-Kuang Kuo and Kuo-Wei Wu", title = "Traffic prediction and {QoS} transmission of real-time live {VBR} videos in {WLANs}", journal = j-TOMCCAP, volume = "7", number = "4", pages = "36:1--36:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043614", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Maddage:2011:BSS, author = "Namunu C. 
Maddage and Haizhou Li", title = "Beat space segmentation and octave scale cepstral feature for sung language recognition in pop music", journal = j-TOMCCAP, volume = "7", number = "4", pages = "37:1--37:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043615", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Santini:2011:ECQ, author = "Simone Santini", title = "Efficient computation of queries on feature streams", journal = j-TOMCCAP, volume = "7", number = "4", pages = "38:1--38:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043616", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Verdugo:2011:IFC, author = "Renato Verdugo and Miguel Nussbaum and Pablo Corro and Pablo Nu{\~n}ez and Paula Navarrete", title = "Interactive films and coconstruction", journal = j-TOMCCAP, volume = "7", number = "4", pages = "39:1--39:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043617", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghandeharizadeh:2011:DCC, author = "Shahram Ghandeharizadeh and Shahin Shayandeh", title = "Domical cooperative caching for streaming media in wireless home networks", journal = j-TOMCCAP, volume = "7", number = "4", pages = "40:1--40:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043618", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghandeharizadeh:2011:CPS, author = "Shahram Ghandeharizadeh and Shahin Shayandeh", title = "Call for papers: Special issue on {$3$D} mobile multimedia", journal = j-TOMCCAP, volume = "7", number = "4", pages = "41:1--41:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043612.2043619", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 15 08:53:32 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2012:ENC, author = "Ralf Steinmetz", title = "Editorial note and call for nominations: {Nicolas D. Georganas} best paper award", journal = j-TOMCCAP, volume = "8", number = "1", pages = "1:1--1:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071397", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghinea:2012:SSS, author = "Gheorghita Ghinea and Oluwakemi Ademoye", title = "The sweet smell of success: Enhancing multimedia applications with olfaction", journal = j-TOMCCAP, volume = "8", number = "1", pages = "2:1--2:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071398", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Olfaction, or smell, is one of the last challenges which multimedia applications have to conquer. As far as computerized smell is concerned, there are several difficulties to overcome, particularly those associated with the ambient nature of smell. In this article, we present results from an empirical study exploring users' perception of olfaction-enhanced multimedia displays. Findings show that olfaction significantly adds to the user multimedia experience. Moreover, use of olfaction leads to an increased sense of reality and relevance. Our results also show that users are tolerant of the interference and distortion effects caused by olfactory effect in multimedia.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hefeeda:2012:DET, author = "Mohamed Hefeeda and Cheng-Hsin Hsu", title = "Design and evaluation of a testbed for mobile {TV} networks", journal = j-TOMCCAP, volume = "8", number = "1", pages = "3:1--3:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071399", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents the design of a complete, open-source, testbed for broadcast networks that offer mobile TV services. Although basic architectures and protocols have been developed for such networks, detailed performance tuning and analysis are still needed, especially when these networks scale to serve many diverse TV channels to numerous subscribers. The detailed performance analysis could also motivate designing new protocols and algorithms for enhancing future mobile TV networks. Currently, many researchers evaluate the performance of mobile TV networks using simulation and/or theoretical modeling methods. These methods, while useful for early assessment, typically abstract away many necessary details of actual, fairly complex, networks. Therefore, an open-source platform for evaluating new ideas in a real mobile TV network is needed. This platform is currently not possible with commercial products, because they are sold as black boxes without the source code. In this article, we summarize our experiences in designing and implementing a testbed for mobile TV networks. We integrate off-the-shelf hardware components with carefully designed software modules to realize a scalable testbed that covers almost all aspects of real networks. 
We use our testbed to empirically analyze various performance aspects of mobile TV networks and validate/refute several claims made in the literature as well as discover/quantify multiple important performance tradeoffs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2012:DMS, author = "Yu-Ru Lin and Hari Sundaram and Munmun {De Choudhury} and Aisling Kelliher", title = "Discovering multirelational structure in social media streams", journal = j-TOMCCAP, volume = "8", number = "1", pages = "4:1--4:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071400", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we present a novel algorithm to discover multirelational structures from social media streams. A media item such as a photograph exists as part of a meaningful interrelationship among several attributes, including time, visual content, users, and actions. Discovery of such relational structures enables us to understand the semantics of human activity and has applications in content organization, recommendation algorithms, and exploratory social network analysis. We are proposing a novel nonnegative matrix factorization framework to characterize relational structures of group photo streams. The factorization incorporates image content features and contextual information. The idea is to consider a cluster as having similar relational patterns; each cluster consists of photos relating to similar content or context. 
Relations represent different aspects of the photo stream data, including visual content, associated tags, photo owners, and post times. The extracted structures minimize the mutual information of the predicted joint distribution. We also introduce a relational modularity function to determine the structure cost penalty, and hence determine the number of clusters. Extensive experiments on a large Flickr dataset suggest that our approach is able to extract meaningful relational patterns from group photo streams. We evaluate the utility of the discovered structures through a tag prediction task and through a user study. Our results show that our method based on relational structures, outperforms baseline methods, including feature and tag frequency based techniques, by 35\%--420\%. We have conducted a qualitative user study to evaluate the benefits of our framework in exploring group photo streams. The study indicates that users found the extracted clustering results clearly represent major themes in a group; the clustering results not only reflect how users describe the group data but often lead the users to discover the evolution of the group activity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheng:2012:EIC, author = "Xu Cheng and Jiangchuan Liu", title = "Exploring interest correlation for peer-to-peer socialized video sharing", journal = j-TOMCCAP, volume = "8", number = "1", pages = "5:1--5:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071401", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The last five years have witnessed an explosion of networked video sharing, represented by YouTube, as a new killer Internet application. Their sustainable development however is severely hindered by the intrinsic limit of their client/server architecture. A shift to the peer-to-peer paradigm has been widely suggested with success already shown in live video streaming and movie-on-demand. Unfortunately, our latest measurement demonstrates that short video clips exhibit drastically different statistics, which would simply render these existing solutions suboptimal, if not entirely inapplicable. Our long-term measurement over five million YouTube videos, on the other hand, reveals interesting social networks with strong correlation among the videos, thus opening new opportunities to explore. In this article, we present NetTube, a novel peer-to-peer assisted delivering framework that explores the user interest correlation for short video sharing. We address a series of key design issues to realize the system, including a bi-layer overlay, an efficient indexing scheme, a delay-aware scheduling mechanism, and a prefetching strategy leveraging interest correlation. 
We evaluate NetTube through both simulations and prototype experiments, which show that it greatly reduces the server workload, improves the playback quality and scales well.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mei:2012:ITC, author = "Tao Mei and Lusong Li and Xian-Sheng Hua and Shipeng Li", title = "{ImageSense}: Towards contextual image advertising", journal = j-TOMCCAP, volume = "8", number = "1", pages = "6:1--6:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071402", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The daunting volumes of community-contributed media contents on the Internet have become one of the primary sources for online advertising. However, conventional advertising treats image and video advertising as general text advertising by displaying relevant ads based on the contents of the Web page, without considering the inherent characteristics of visual contents. This article presents a contextual advertising system driven by images, which automatically associates relevant ads with an image rather than the entire text in a Web page and seamlessly inserts the ads in the nonintrusive areas within each individual image. The proposed system, called ImageSense, supports scalable advertising of, from root to node, Web sites, pages, and images. In ImageSense, the ads are selected based on not only textual relevance but also visual similarity, so that the ads yield contextual relevance to both the text in the Web page and the image content. 
The ad insertion positions are detected based on image salience, as well as face and text detection, to minimize intrusiveness to the user. We evaluate ImageSense on a large-scale real-world images and Web pages, and demonstrate the effectiveness of ImageSense for online image advertising.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Snidaro:2012:FMV, author = "Lauro Snidaro and Ingrid Visentini and Gian Luca Foresti", title = "Fusing multiple video sensors for surveillance", journal = j-TOMCCAP, volume = "8", number = "1", pages = "7:1--7:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071403", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Real-time detection, tracking, recognition, and activity understanding of moving objects from multiple sensors represent fundamental issues to be solved in order to develop surveillance systems that are able to autonomously monitor wide and complex environments. The algorithms that are needed span therefore from image processing to event detection and behaviour understanding, and each of them requires dedicated study and research. In this context, sensor fusion plays a pivotal role in managing the information and improving system performance. Here we present a novel fusion framework for combining the data coming from multiple and possibly heterogeneous sensors observing a surveillance area.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2012:TAM, author = "Jiun-Long Huang and Shih-Chuan Chiu and Man-Kwan Shan", title = "Towards an automatic music arrangement framework using score reduction", journal = j-TOMCCAP, volume = "8", number = "1", pages = "8:1--8:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2071396.2071404", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:02 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Score reduction is a process that arranges music for a target instrument by reducing original music. In this study we present a music arrangement framework that uses score reduction to automatically arrange music for a target instrument. The original music is first analyzed to determine the type of arrangement element of each section, then the phrases are identified and each is assigned a utility according to its type of arrangement element. For a set of utility-assigned phrases, we transform the music arrangement into an optimization problem and propose a phrase selection algorithm. The music is arranged by selecting appropriate phrases satisfying the playability constraints of a target instrument. Using the proposed framework, we implement a music arrangement system for the piano. An approach similar to Turing test is used to evaluate the quality of the music arranged by our system. The experiment results show that our system is able to create viable music for the piano.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2012:EN, author = "Ralf Steinmetz", title = "Editorial note", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "9:1--9:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089086", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2012:BET, author = "Dongyu Liu and Fei Li and Bo Shen and Songqing Chen", title = "Building an efficient transcoding overlay for {P2P} streaming to heterogeneous devices", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "10:1--10:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089087", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the increasing deployment of Internet P2P/overlay streaming systems, more and more clients use mobile devices, such as smart phones and PDAs, to access these Internet streaming services. Compared to wired desktops, mobile devices normally have a smaller screen size, a less color depth, and lower bandwidth and thus cannot correctly and effectively render and display the data streamed to desktops. 
To address this problem, in this paper, we propose PAT (Peer-Assisted Transcoding) to enable effective online transcoding in P2P/overlay streaming. PAT has the following unique features. First, it leverages active peer cooperation without demanding infrastructure support such as transcoding servers. Second, as online transcoding is computationally intensive while the various devices used by participating clients may have limited computing power and related resources (e.g., battery, bandwidth), an additional overlay, called metadata overlay, is constructed to instantly share the intermediate transcoding result of a transcoding procedure with other transcoding nodes to minimize the total computing overhead in the system. The experimental results collected within a realistically simulated testbed show that by consuming 6\% extra bandwidth, PAT could save up to 58\% CPU cycles for online transcoding.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2012:IFP, author = "Zhijie Shen and Roger Zimmermann", title = "{ISP}-friendly {P2P} live streaming: a roadmap to realization", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "11:1--11:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089088", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Peer-to-Peer (P2P) applications generate large amounts of Internet network traffic. The wide-reaching connectivity of P2P systems is creating resource inefficiencies for network providers. 
Recent studies have demonstrated that localizing cross-ISP (Internet service provider) traffic can mitigate this challenge. However, bandwidth sensitivity and display quality requirements complicate the ISP-friendly design for live streaming systems. To this date, although some prior techniques focusing on live streaming systems exist, the correlation between traffic localization and streaming quality guarantee has not been well explored. Additionally, the proposed solutions are often not easy to apply in practice. In our presented work, we demonstrate that the cross-ISP traffic of P2P live streaming systems can be significantly reduced with little impact on the streaming quality. First, we analytically investigate and quantify the tradeoff between traffic localization and streaming quality guarantee, determining the lower bound of the inter-AS (autonomous system) streaming rate below which streaming quality cannot be preserved. Based on the analysis, we further propose a practical ISP-friendly solution, termed IFPS, which requires only minor changes to the peer selection mechanism and can easily be integrated into both new and existing systems. Additionally, the significant opportunity for localizing traffic is underscored by our collected traces from PPLive, which also enabled us to derive realistic parameters to guide our simulations. The experimental results demonstrate that IFPS reduces cross-ISP traffic from 81\% up to 98\% while keeping streaming quality virtually unaffected.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lou:2012:QDD, author = "Xiaosong Lou and Kai Hwang", title = "Quality of data delivery in peer-to-peer video streaming", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "12:1--12:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089089", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "QoS in a P2P video streaming system is evaluated in three stages: content generation, data delivery and video playback. We use jitter-free probability as the main performance metric to study Quality of Data delivery (QoD). A new model that incorporates both bandwidth and data availability of P2P network is proposed. Our model relies on a sharing factor that models data availability among all peers. We simulate on a minimalistic network to demonstrate how to apply the analytical model to design a P2P video streaming system with a very low jitter rate. Our simulation experimental results reveal that the lower bound on jitter-free probability is indeed effective to reflect the QoD of the entire system. Our model captures the impact of many design choices, including upload bandwidth limit, peer selection strategies, and video stream chunking schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2012:DNW, author = "Chuan Wu and Baochun Li and Shuqiao Zhao", title = "Diagnosing network-wide {P2P} live streaming inefficiencies", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "13:1--13:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089090", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Large-scale live peer-to-peer (P2P) streaming applications have been successfully deployed in today's Internet. While they can accommodate hundreds of thousands of users simultaneously with hundreds of channels of programming, there still commonly exist channels and times where and when the streaming quality is unsatisfactory. In this paper, based on more than two terabytes and one year worth of live traces from UUSee, a large-scale commercial P2P live streaming system, we show an in-depth network-wide diagnosis of streaming inefficiencies, commonly present in typical mesh-based P2P live streaming systems. As the first highlight of our work, we identify an evolutionary pattern of low streaming quality in the system, and the distribution of streaming inefficiencies across various streaming channels and in different geographical regions. We then carry out an extensive investigation to explore the causes to such streaming inefficiencies over different times and across different channels/regions at specific times, by investigating the impact of factors such as the number of peers, peer upload bandwidth, inter-peer bandwidth availability, server bandwidth consumption, and many more. 
The original discoveries we have brought forward include the two-sided effects of peer population on the streaming quality in a streaming channel, the significant impact of inter-peer bandwidth bottlenecks at peak times, and the inefficient utilization of server capacities across concurrent channels. Based on these insights, we identify problems within the existing P2P live streaming design and discuss a number of suggestions to improve real-world streaming protocols operating at a large scale.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2012:ABP, author = "Chuan Wu and Zongpeng Li and Xuanjia Qiu and Francis C. M. Lau", title = "Auction-based {P2P VoD} streaming: Incentives and optimal scheduling", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "14:1--14:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089091", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Real-world large-scale Peer-to-Peer (P2P) Video-on-Demand (VoD) streaming applications face more design challenges as compared to P2P live streaming, due to higher peer dynamics and less buffer overlap. The situation is further complicated when we consider the selfish nature of peers, who in general wish to download more and upload less, unless otherwise motivated. Taking a new perspective of distributed dynamic auctions, we design efficient P2P VoD streaming algorithms with simultaneous consideration of peer incentives and streaming optimality. 
In our solution, media block exchanges among peers are carried out through local auctions, in which budget-constrained peers bid for desired blocks from their neighbors, which in turn deliver blocks to the winning bidders and collect revenue. With strategic design of a discriminative second price auction with seller reservation, a supplying peer has full incentive to maximally contribute its bandwidth to increase its budget; requesting peers are also motivated to bid in such a way that optimal media block scheduling is achieved effectively in a fully decentralized fashion. Applying techniques from convex optimization and mechanism design, we prove (a) the incentive compatibility at the selling and buying peers, and (b) the optimality of the induced media block scheduling in terms of social welfare maximization. Large-scale empirical studies are conducted to investigate the behavior of the proposed auction mechanisms in dynamic P2P VoD systems based on real-world settings.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2012:PHL, author = "Tieying Zhang and Xueqi Cheng and Jianming Lv and Zhenhua Li and Weisong Shi", title = "Providing hierarchical lookup service for {P2P--VoD} systems", journal = j-TOMCCAP, volume = "8s", number = "1", pages = "15:1--15:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2089085.2089092", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Mar 16 15:56:04 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Supporting random jump in P2P-VoD systems requires efficient lookup for the `best' suppliers, where `best' means the suppliers should meet two requirements: content match and network quality match. Most studies use a DHT-based method to provide content lookup; however, these methods are neither able to meet the network quality requirements nor suitable for VoD streaming due to the large overhead. In this paper, we propose Mediacoop, a novel hierarchical lookup scheme combining both content and quality match to provide random jumps for P2P-VoD systems. It exploits the play position to efficiently locate the candidate suppliers with required data (content match), and performs refined lookup within the candidates to meet quality match. Theoretical analysis and simulation results show that Mediacoop is able to achieve lower jump latency and control overhead than the typical DHT-based method. Moreover, we implement Mediacoop in a BitTorrent-like P2P-VoD system called CoolFish and make optimizations for such ` total cache' applications. 
The implementation and evaluation in CoolFish show that Mediacoop is able to improve user experiences, especially the jump latency, which verifies the practicability of our design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Anonymous:2012:TCO, author = "Anonymous", title = "Table of Contents: Online Supplement Volume {8S}, Number 1", journal = j-TOMCCAP, volume = "8", number = "2", pages = "16:1--16:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2169004", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dornaika:2012:IRF, author = "Fadi Dornaika and James H. Elder", title = "Image registration for foveated panoramic sensing", journal = j-TOMCCAP, volume = "8", number = "2", pages = "17:1--17:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2168997", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article addresses the problem of registering high-resolution, small field-of-view images with low-resolution panoramic images provided by a panoramic catadioptric video sensor. 
Such systems may find application in surveillance and telepresence systems that require a large field of view and high resolution at selected locations. Although image registration has been studied in more conventional applications, the problem of registering panoramic and conventional video has not previously been addressed, and this problem presents unique challenges due to (i) the extreme differences in resolution between the sensors (more than a 16:1 linear resolution ratio in our application), and (ii) the resolution inhomogeneity of panoramic images. The main contributions of this article are as follows. First, we introduce our foveated panoramic sensor design. Second, we show how a coarse registration can be computed from the raw images using parametric template matching techniques. Third, we propose two refinement methods allowing automatic and near real-time registration between the two image streams. The first registration method is based on matching extracted interest points using a closed form method. The second registration method is featureless and based on minimizing the intensity discrepancy allowing the direct recovery of both the geometric and the photometric transforms. Fourth, a comparison between the two registration methods is carried out, which shows that the featureless method is superior in accuracy. Registration examples using the developed methods are presented.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2012:CPC, author = "Xin Zhang and Tom{\'a}s Ward and S{\'e}amus Mcloone", title = "Comparison of predictive contract mechanisms from an information theory perspective", journal = j-TOMCCAP, volume = "8", number = "2", pages = "18:1--18:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2168998", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Inconsistency arises across a Distributed Virtual Environment due to network latency induced by state changes communications. Predictive Contract Mechanisms (PCMs) combat this problem through reducing the amount of messages transmitted in return for perceptually tolerable inconsistency. To date there are no methods to quantify the efficiency of PCMs in communicating this reduced state information. This article presents an approach derived from concepts in information theory for a deeper understanding of PCMs. Through a comparison of representative PCMs, the worked analysis illustrates interesting aspects of PCMs operation and demonstrates how they can be interpreted as a form of lossy information compression.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Olsen:2012:ITN, author = "Dan R. 
Olsen and Derek Bunn and Trent Boulter and Robert Walz", title = "Interactive television news", journal = j-TOMCCAP, volume = "8", number = "2", pages = "19:1--19:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2168999", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A new interactive television experience has been created for watching television news. The goal is to create a news experience that is similar to the way people watch television in their living rooms while giving viewers the power to make choices about what they see. We partnered with existing news organizations to create tools consistent with current news production practices. The viewer experience allows selection of the order of news content, skipping unwanted content and exploring stories in more depth. These tools were used to produce seven days of interactive commercial news that were viewed in ten homes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Armitage:2012:ROF, author = "Grenville Armitage and Amiel Heyde", title = "{REED}: {Optimizing} first person shooter game server discovery using network coordinates", journal = j-TOMCCAP, volume = "8", number = "2", pages = "20:1--20:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2169000", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online First Person Shooter (FPS) games typically use a client-server communication model, with thousands of enthusiast-hosted game servers active at any time. Traditional FPS server discovery may take minutes, as clients create thousands of short-lived packet flows while probing all available servers to find a selection of game servers with tolerable round trip time (RTT). REED reduces a client's probing time and network traffic to 1\% of traditional server discovery. REED game servers participate in a centralized, incremental calculation of their network coordinates, and clients use these coordinates to expedite the discovery of servers with low RTTs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2012:ILC, author = "Xiaobai Liu and Shuicheng Yan and Tat-Seng Chua and Hai Jin", title = "Image label completion by pursuing contextual decomposability", journal = j-TOMCCAP, volume = "8", number = "2", pages = "21:1--21:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2169001", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article investigates how to automatically complete the missing labels for the partially annotated images, without image segmentation. The label completion procedure is formulated as a nonnegative data factorization problem, to decompose the global image representations that are used for describing the entire images, for instance, various image feature descriptors, into their corresponding label representations, that are used for describing the local semantic regions within images. The solution provided in this work is motivated by following observations. First, label representations of the regions with the same label often share certain commonness, yet may be essentially different due to the large intraclass variations. Thus, each label or concept should be represented by using a subspace spanned by an ensemble of basis, instead of a single one, to characterize the intralabel diversities. Second, the subspaces for different labels are different from each other. Third, while two images are similar with each other, the corresponding label representations should be similar. 
We formulate this cross-image context as well as the given partial label annotations in the framework of nonnegative data factorization and then propose an efficient multiplicative nonnegative update rules to alternately optimize the subspaces and the reconstruction coefficients. We also provide the theoretic proof of algorithmic convergence and correctness. Extensive experiments over several challenging image datasets clearly demonstrate the effectiveness of our proposed solution in boosting the quality of image label completion and image annotation accuracy. Based on the same formulation, we further develop a label ranking algorithms, to refine the noised image labels without any manual supervision. We compare the proposed label ranking algorithm with the state-of-the-arts over the popular evaluation databases and achieve encouragingly improvements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2012:SGU, author = "Yi Chen and Abhidnya A. Deshpande and Ramazan S. Ayg{\"u}n", title = "Sprite generation using sprite fusion", journal = j-TOMCCAP, volume = "8", number = "2", pages = "22:1--22:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2169002", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "There has been related research for sprite or mosaic generation for over 15 years. In this article, we try to understand the methodologies for sprite generation and identify what has not actually been covered for sprite generation. We first identify issues and focus on the domain of videos for sprite generation. 
We introduce a novel sprite fusion method that blends two sprites. Sprite fusion method produces good results for tracking videos and does not require object segmentation. We present sample results of our experiments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Weng:2012:CVR, author = "Ming-Fang Weng and Yung-Yu Chuang", title = "Collaborative video reindexing via matrix factorization", journal = j-TOMCCAP, volume = "8", number = "2", pages = "23:1--23:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2168996.2169003", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:03 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Concept-based video indexing generates a matrix of scores predicting the possibilities of concepts occurring in video shots. Based on the idea of collaborative filtering, this article presents unsupervised methods to refine the initial scores generated by concept classifiers by taking into account the concept-to-concept correlation and shot-to-shot similarity embedded within the score matrix. Given a noisy matrix, we refine the inaccurate scores via matrix factorization. This method is further improved by learning multiple local models and incorporating contextual-temporal structures. Experiments on the TRECVID 2006--2008 datasets demonstrate relative performance gains ranging from 13\% to 52\% without using any user annotations or external knowledge resources.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kankanhalli:2012:ISI, author = "Mohan S. Kankanhalli", title = "Introduction to special issue on multimedia security", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "31:1--31:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344437", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Weir:2012:IHV, author = "Jonathan Weir and Weiqi Yan and Mohan S. Kankanhalli", title = "Image hatching for visual cryptography", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "32:1--32:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344438", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image hatching (or nonphotorealistic line-art) is a technique widely used in the printing or engraving of currency. Diverse styles of brush strokes have previously been adopted for different areas of an image to create aesthetically pleasing textures and shading. Because there is no continuous tone within these types of images, a multilevel scheme is proposed, which uses different textures based on a threshold level. 
These textures are then applied to the different levels and are then combined to build up the final hatched image. The proposed technique allows a secret to be hidden using Visual Cryptography (VC) within the hatched images. Visual cryptography provides a very powerful means by which one secret can be distributed into two or more pieces known as shares. When the shares are superimposed exactly together, the original secret can be recovered without computation. Also provided is a comparison between the original grayscale images and the resulting hatched images that are generated by the proposed algorithm. This reinforces that the overall quality of the hatched scheme is sufficient. The Structural SIMilarity index (SSIM) is used to perform this comparison.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2012:RIB, author = "Jian Li and Hongmei Liu and Jiwu Huang and Yun Q. Shi", title = "Reference index-based {H.264} video watermarking scheme", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "33:1--33:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344439", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video watermarking has received much attention over the past years as a promising solution to copy protection. Watermark robustness is still a key issue of research, especially when a watermark is embedded in the compressed video domain. In this article, a robust watermarking scheme for H.264 video is proposed. 
During video encoding, the watermark is embedded in the index of the reference frame, referred to as reference index, a bitstream syntax element newly proposed in the H.264 standard. Furthermore, the video content (current coded blocks) is modified based on an optimization model, aiming at improving watermark robustness without unacceptably degrading the video's visual quality or increasing the video's bit rate. Compared with the existing schemes, our method has the following three advantages: (1) The bit rate of the watermarked video is adjustable; (2) the robustness against common video operations can be achieved; (3) the watermark embedding and extraction are simple. Extensive experiments have verified the good performance of the proposed watermarking scheme.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gao:2012:RHC, author = "Xifeng Gao and Caiming Zhang and Yan Huang and Zhigang Deng", title = "A robust high-capacity affine-transformation-invariant scheme for watermarking {$3$D} geometric models", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "34:1--34:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344440", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article we propose a novel, robust, and high-capacity watermarking method for 3D meshes with arbitrary connectivities in the spatial domain based on affine invariants. 
Given a 3D mesh model, a watermark is embedded as affine-invariant length ratios of one diagonal segment to the residing diagonal intersected by the other one in a coplanar convex quadrilateral. In the extraction process, a watermark is recovered by combining all the watermark pieces embedded in length ratios through majority voting. Extensive experimental results demonstrate the robustness, high computational efficiency, high capacity, and affine-transformation-invariant characteristics of the proposed approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2012:EMA, author = "Rui Yang and Zhenhua Qu and Jiwu Huang", title = "Exposing {MP3} audio forgeries using frame offsets", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "35:1--35:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344441", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Audio recordings should be authenticated before they are used as evidence. Although audio watermarking and signature are widely applied for authentication, these two techniques require accessing the original audio before it is published. Passive authentication is necessary for digital audio, especially for the most popular audio format: MP3. In this article, we propose a passive approach to detect forgeries of MP3 audio. During the process of MP3 encoding the audio samples are divided into frames, and thus each frame has its own frame offset after encoding. Forgeries lead to the breaking of framing grids. 
So the frame offset is a good indication for locating forgeries, and it can be retrieved by the identification of the quantization characteristic. In this way, the doctored positions can be automatically located. Experimental results demonstrate that the proposed approach is effective in detecting some common forgeries, such as deletion, insertion, substitution, and splicing. Even when the bit rate is as low as 32 kbps, the detection rate is above 99\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Feng:2012:CAO, author = "Hui Feng and Hefei Ling and Fuhao Zou and Weiqi Yan and Zhengding Lu", title = "A collusion attack optimization strategy for digital fingerprinting", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "36:1--36:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344442", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Collusion attack is a cost-efficient attack for digital fingerprinting. In this article, we propose a novel collusion attack strategy, Iterative Optimization Collusion Attack (IOCA), which is based upon the gradient attack and the principle of informed watermark embedding. We evaluate the performance of the proposed collusion attack strategy in defeating four typical fingerprinting schemes under a well-constructed evaluation framework. The simulation results show that the proposed strategy performs more effectively than the gradient attack, and adopting no more than three fingerprinted copies can sufficiently collapse examined fingerprinting schemes. 
Meanwhile, the content resulting from the proposed attack still preserves high perceptual quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sachan:2012:ALV, author = "Amit Sachan and Sabu Emmanuel and Mohan S. Kankanhalli", title = "Aggregate licenses validation for digital rights violation detection", journal = j-TOMCCAP, volume = "8", number = "2S", pages = "37:1--37:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2344436.2344443", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:05 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Digital Rights Management (DRM) is the term associated with the set of technologies to prevent illegal multimedia content distribution and consumption. DRM systems generally involve multiple parties such as owner, distributors, and consumers. The owner issues redistribution licenses to its distributors. The distributors in turn using their received redistribution licenses can generate and issue new redistribution licenses to other distributors and new usage licenses to consumers. As a part of rights violation detection, these newly generated licenses must be validated by a validation authority against the redistribution license used to generate them. The validation of these newly generated licenses becomes quite complex when there exist multiple redistribution licenses for a media with the distributors. In such cases, the validation process requires validation using an exponential number (to the number of redistribution licenses) of validation inequalities and each validation inequality may contain up to an exponential number of summation terms.
This makes the validation process computationally intensive and necessitates to do the validation efficiently. To overcome this, we propose validation tree, a prefix-tree-based validation method to do the validation efficiently. Theoretical analysis and experimental results show that our proposed technique reduces the validation time significantly.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Riiser:2012:VSU, author = "Haakon Riiser and Tore Endestad and Paul Vigmostad and Carsten Griwodz and P{\aa}l Halvorsen", title = "Video streaming using a location-based bandwidth-lookup service for bitrate planning", journal = j-TOMCCAP, volume = "8", number = "3", pages = "24:1--24:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A lot of people around the world commute using public transportation and would like to spend this time viewing streamed video content such as news or sports updates. However, mobile wireless networks typically suffer from severe bandwidth fluctuations, and the networks are often completely unresponsive for several seconds, sometimes minutes. Today, there are several ways of adapting the video bitrate and thus the video quality to such fluctuations, for example, using scalable video codecs or segmented adaptive HTTP streaming that switches between nonscalable video streams encoded in different bitrates.
Still, for a better long-term video playout experience that avoids disruptions and frequent quality changes while using existing video adaptation technology, it is desirable to perform bandwidth prediction and planned quality adaptation. This article describes a video streaming system for receivers equipped with a GPS. A receiver's download rate is constantly monitored, and periodically reported back to a central database along with associated GPS positional data. Thus, based on the current location, a streaming device can use a GPS-based bandwidth-lookup service in order to better predict the near-future bandwidth availability and create a schedule for the video playout that takes likely future availability into account. To create a prototype and perform initial tests, we conducted several field trials while commuting using public transportation. We show how our database has been used to predict bandwidth fluctuations and network outages, and how this information helps maintain uninterrupted playback with less compromise on video quality than possible without prediction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Valdes:2012:AEV, author = "Victor Valdes and Jose M. 
Martinez", title = "Automatic evaluation of video summaries", journal = j-TOMCCAP, volume = "8", number = "3", pages = "25:1--25:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240138", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article describes a method for the automatic evaluation of video summaries based on the training of individual predictors for different quality measures from the TRECVid 2008 BBC Rushes Summarization Task. The obtained results demonstrate that, with a large set of evaluation data, it is possible to train fully automatic evaluation systems based on visual features automatically extracted from the summaries. The proposed approach will enable faster and easier estimation of the results of newly developed abstraction algorithms and the study of which summary characteristics influence their perceived quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tian:2012:STL, author = "Xinmei Tian and Dacheng Tao and Yong Rui", title = "Sparse transfer learning for interactive video search reranking", journal = j-TOMCCAP, volume = "8", number = "3", pages = "26:1--26:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240139", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Visual reranking is effective to improve the performance of the text-based video search. 
However, existing reranking algorithms can only achieve limited improvement because of the well-known semantic gap between low-level visual features and high-level semantic concepts. In this article, we adopt interactive video search reranking to bridge the semantic gap by introducing user's labeling effort. We propose a novel dimension reduction tool, termed sparse transfer learning (STL), to effectively and efficiently encode user's labeling information. STL is particularly designed for interactive video search reranking. Technically, it (a) considers the pair-wise discriminative information to maximally separate labeled query relevant samples from labeled query irrelevant ones, (b) achieves a sparse representation for the subspace to encode user's intention by applying the elastic net penalty, and (c) propagates user's labeling information from labeled samples to unlabeled samples by using the data distribution knowledge. We conducted extensive experiments on the TRECVID 2005, 2006 and 2007 benchmark datasets and compared STL with popular dimension reduction algorithms. We report superior performance by using the proposed STL-based interactive video search reranking.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2012:IBD, author = "Xin Zhang and Tom{\'a}s E.
Ward and S{\'e}amus McLoone", title = "An information-based dynamic extrapolation model for networked virtual environments", journal = j-TOMCCAP, volume = "8", number = "3", pages = "27:1--27:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Various Information Management techniques have been developed to help maintain a consistent shared virtual world in a Networked Virtual Environment. However, such techniques have to be carefully adapted to the application state dynamics and the underlying network. This work presents a novel framework that minimizes inconsistency by optimizing bandwidth usage to deliver useful information. This framework measures the state evolution using an information model and dynamically switches extrapolation models and the packet rate to make the most information-efficient usage of the available bandwidth. The results shown demonstrate that this approach can help optimize consistency under constrained and time-varying network conditions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2012:UCM, author = "Linjun Yang and Bo Geng and Alan Hanjalic and Xian-Sheng Hua", title = "A unified context model for web image retrieval", journal = j-TOMCCAP, volume = "8", number = "3", pages = "28:1--28:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240141", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Content-based web image retrieval based on the query-by-example (QBE) principle remains a challenging problem due to the semantic gap as well as the gap between a user's intent and the representativeness of a typical image query. In this article, we propose to address this problem by integrating query-related contextual information into an advanced query model to improve the performance of QBE-based web image retrieval. We consider both the local and global context of the query image. The local context can be inferred from the web pages and the click-through log associated with the query image, while the global context is derived from the entire corpus comprising all web images and the associated web pages. To effectively incorporate the local query context we propose a language modeling based approach to deal with the combined structured query representation from the contextual and visual information. The global query context is integrated by the multi-modal relevance model to ``reconstruct'' the query from the document models indexed in the corpus. 
In this way, the global query context is employed to address the noise or missing information in the query and its local context, so that a comprehensive and robust query model can be obtained. We evaluated the proposed approach on a representative product image dataset collected from the web and demonstrated that the inclusion of the local and global query contexts significantly improves the performance of QBE-based web image retrieval.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Patras:2012:CTS, author = "Paul Patras and Albert Banchs and Pablo Serrano", title = "A control theoretic scheme for efficient video transmission over {IEEE 802.11e EDCA WLANs}", journal = j-TOMCCAP, volume = "8", number = "3", pages = "29:1--29:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240142", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The EDCA mechanism of the IEEE 802.11 standard has been designed to support, among others, video traffic. This mechanism relies on a number of parameters whose configuration is left open by the standard. Although there are some recommended values for these parameters, they are fixed independent of the WLAN conditions, which results in suboptimal performance. Following this observation, a number of approaches in the literature have been devised to set the EDCA parameters based on an estimation of the WLAN conditions. However, these previous approaches are based on heuristics and hence do not guarantee optimized performance. 
In this article we propose a novel algorithm to adjust the EDCA parameters to carry video traffic which, in contrast to previous approaches, is sustained on mathematical foundations that guarantee optimal performance. In particular, our approach builds upon (i) an analytical model of the WLAN performance under video traffic, used to derive the optimal point of operation of EDCA, and (ii) a control theoretic designed mechanism which drives the WLAN to this point of operation. Via extensive simulations, we show that the proposed approach performs optimally and substantially outperforms the standard recommended configuration as well as previous adaptive proposals.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhu:2012:JLS, author = "Xinglei Zhu and Chang W. Chen", title = "A joint layered scheme for reliable and secure mobile {JPEG-2000} streaming", journal = j-TOMCCAP, volume = "8", number = "3", pages = "30:1--30:??", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2240136.2240143", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:06 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel joint layered approach to simultaneously achieve both reliable and secure mobile JPEG-2000 image streaming. With a priori knowledge of JPEG-2000 source coding and channel coding, the proposed joint system integrates authentication into the media error protection components to ensure that every source-decodable media unit is authenticated. 
By such a dedicated design, the proposed scheme protects both compressed JPEG-2000 codestream and the authentication data from wireless channel impairments. It is fundamentally different from many existing systems that consider the problem of media authentication separately from the other operations in the media transmission system. By utilizing the contextual relationship, such as coding dependency and content importance between media slices for authentication hash appending, the proposed scheme generates an extremely low authentication overhead. Under this joint layered coding framework, an optimal rate allocation algorithm for source coding, channel coding, and media authentication is developed to guarantee end-to-end media quality. Experiment results on JPEG-2000 images validate the proposed scheme and demonstrate that the performance of the proposed scheme is approaching its upper bound, in which case no authentication is applied to the media stream.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gatica-Perez:2012:ISS, author = "Daniel Gatica-Perez and Gang Hua and Wei Tsang Ooi and P{\aa}l Halvorsen", title = "Introduction to the special section of best papers of {ACM Multimedia 2011}", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "38:1--38:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348817", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2012:CPA, author = "Wanmin Wu and Ahsan Arefin and Gregorij Kurillo and Pooja Agarwal and Klara Nahrstedt and Ruzena Bajcsy", title = "{CZLoD}: a psychophysical approach for {$3$D} tele-immersive video", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "39:1--39:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348818", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a psychophysical study that measures the perceptual thresholds of a new factor called Color-plus-Depth Level-of-Details (CZLoD) peculiar to polygon-based 3D tele-immersive video. The results demonstrate the existence of Just Noticeable Degradation and Just Unacceptable Degradation thresholds on the factor. In light of the results, we design and implement a real-time perception-based quality adaptor for 3D tele-immersive video. Our experimental results show that the adaptation scheme can reduce resource usage (e.g., CPU cycles) while considerably enhancing the overall perceived visual quality. Our analysis confirms the potential temporal and spatial performance benefits achievable with CZLoD adaptation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ji:2012:AQS, author = "Rongrong Ji and Felix X. 
Yu and Tongtao Zhang and Shih-Fu Chang", title = "Active query sensing: {Suggesting} the best query view for mobile visual search", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "40:1--40:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348819", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While much exciting progress is being made in mobile visual search, one important question has been left unexplored in all current systems. When searching objects or scenes in the 3D world, which viewing angle is more likely to be successful? More particularly, if the first query fails to find the right target, how should the user control the mobile camera to form the second query? In this article, we propose a novel Active Query Sensing system for mobile location search, which actively suggests the best subsequent query view to recognize the physical location in the mobile environment. The proposed system includes two unique components: (1) an offline process for analyzing the saliencies of different views associated with each geographical location, which predicts the location search precisions of individual views by modeling their self-retrieval score distributions. (2) an online process for estimating the view of an unseen query, and suggesting the best subsequent view change. Specifically, the optimal viewing angle change for the next query can be formulated as an online information theoretic approach. Using a scalable visual search system implemented over a NYC street view dataset (0.3 million images), we show a performance gain by reducing the failure rate of mobile location search to only 12\% after the second query. 
We have also implemented an end-to-end functional system, including user interfaces on iPhones, client-server communication, and a remote search server. This work may open up an exciting new direction for developing interactive mobile media applications through the innovative exploitation of active sensing and query formulation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shirmohammadi:2012:ISS, author = "Shervin Shirmohammadi and Mohamed Hefeeda and Wei Tsang Ooi and Romulus Grigoras", title = "Introduction to special section on {$3$D} mobile multimedia", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "41:1--41:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348820", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2012:QOV, author = "Yanwei Liu and Song Ci and Hui Tang and Yun Ye and Jinxia Liu", title = "{QoE}-oriented {$3$D} video transcoding for mobile streaming", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "42:1--42:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348821", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With advance in mobile 3D display, mobile 3D video is already enabled by the wireless multimedia networking, and it will be gradually popular since it can make people enjoy the natural 3D experience anywhere and anytime. In current stage, mobile 3D video is generally delivered over the heterogeneous network combined by wired and wireless channels. How to guarantee the optimal 3D visual quality of experience (QoE) for the mobile 3D video streaming is one of the important topics concerned by the service provider. In this article, we propose a QoE-oriented transcoding approach to enhance the quality of mobile 3D video service. By learning the pre-controlled QoE patterns of 3D contents, the proposed 3D visual QoE inferring model can be utilized to regulate the transcoding configurations in real-time according to the feedbacks of network and user-end device information. In the learning stage, we propose a piecewise linear mean opinion score (MOS) interpolation method to further reduce the cumbersome manual work of preparing QoE patterns. 
Experimental results show that the proposed transcoding approach can provide the adapted 3D stream to the heterogeneous network, and further provide superior QoE performance to the fixed quantization parameter (QP) transcoding and mean squared error (MSE) optimized transcoding for mobile 3D video streaming.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2012:NVT, author = "Shujie Liu and Chang Wen Chen", title = "A novel {$3$D} video transcoding scheme for adaptive {$3$D} video transmission to heterogeneous terminals", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "43:1--43:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348822", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Three-dimensional video (3DV) is attracting many interests with its enhanced viewing experience and more user driven features. 3DV has several unique characteristics different from 2D video: (1) It has a much larger amount of data captured and compressed, and corresponding video compression techniques can be much more complicated in order to explore data redundancy. This will lead to more constraints on users' network access and computational capability, (2) Most users only need part of the 3DV data at any given time, while the users' requirements exhibit large diversity, (3) Only a limited number of views are captured and transmitted for 3DV. View rendering is thus necessary to generate virtual views based on the received 3DV data. However, many terminal devices do not have the functionality to generate virtual views. 
To enable 3DV experience for the majority of users with limited capabilities, adaptive 3DV transmission is necessary to extract/generate the required data content and represent it with supported formats and bitrates for heterogeneous terminal devices. 3DV transcoding is an emerging and effective technique to achieve desired adaptive 3DV transmission. In this article, we propose the first efficient 3DV transcoding scheme that can obtain any desired view, either an encoded one or a virtual one, and compress it with more universal H.264/AVC. The key idea of the proposed scheme is to appropriately utilize motion information contained in the bitstream to generate candidate motion information. Original information of both the desired view and reference views are used to obtain this candidate information and a proper motion refinement process is carried out for certain blocks. Simulation results show that, compared to the straightforward cascade algorithm, the proposed scheme is able to output compressed bitstream of the required view with significantly reduced complexity while incurring negligible performance loss. Such a 3DV transcoding can be applied to most gateways that usually have constraints on computational complexity and time delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Roodaki:2012:NMD, author = "Hoda Roodaki and Mahmoud Reza Hashemi and Shervin Shirmohammadi", title = "A new methodology to derive objective quality assessment metrics for scalable multiview {$3$D} video coding", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "44:1--44:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348823", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the growing demand for 3D video, efforts are underway to incorporate it in the next generation of broadcast and streaming applications and standards. 3D video is currently available in games, entertainment, education, security, and surveillance applications. A typical scenario for multiview 3D consists of several 3D video sequences captured simultaneously from the same scene with the help of multiple cameras from different positions and through different angles. Multiview video coding provides a compact representation of these multiple views by exploiting the large amount of inter-view statistical dependencies. One of the major challenges in this field is how to transmit the large amount of data of a multiview sequence over error prone channels to heterogeneous mobile devices with different bandwidth, resolution, and processing/battery power, while maintaining a high visual quality. Scalable Multiview 3D Video Coding (SMVC) is one of the methods to address this challenge; however, the evaluation of the overall visual quality of the resulting scaled-down video requires a new objective perceptual quality measure specifically designed for scalable multiview 3D video. 
Although several subjective and objective quality assessment methods have been proposed for multiview 3D sequences, no comparable attempt has been made for quality assessment of scalable multiview 3D video. In this article, we propose a new methodology to build suitable objective quality assessment metrics for different scalable modalities in multiview 3D video. Our proposed methodology considers the importance of each layer and its content as a quality of experience factor in the overall quality. Furthermore, in addition to the quality of each layer, the concept of disparity between layers (inter-layer disparity) and disparity between the units of each layer (intra-layer disparity) is considered as an effective feature to evaluate overall perceived quality more accurately. Simulation results indicate that by using this methodology, more efficient objective quality assessment metrics can be introduced for each multiview 3D video scalable modalities.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hamza:2012:EEM, author = "Ahmed Hamza and Mohamed Hefeeda", title = "Energy-efficient multicasting of multiview {$3$D} videos to mobile devices", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "45:1--45:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348824", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multicasting multiple video streams over wireless broadband access networks enables the delivery of multimedia content to large-scale user communities in a cost-efficient manner. 
Three dimensional (3D) videos are the next natural step in the evolution of digital media technologies. In order to provide 3D perception, 3D video streams contain one or more views that greatly increase their bandwidth requirements. Due to the limited channel capacity and variable bit rate of the videos, multicasting multiple 3D videos over wireless broadband networks is a challenging problem. In this article, we consider a 4G wireless access network in which a number of 3D videos represented in two-view plus depth format and encoded using scalable video coders are multicast. We formulate the optimal 3D video multicasting problem to maximize the quality of rendered virtual views on the receivers' displays. We show that this problem is NP-complete and present a polynomial time approximation algorithm to solve it. We then extend the proposed algorithm to efficiently schedule the transmission of the chosen substreams from each video in order to maximize the power saving on the mobile receivers. Our simulation-based experimental results show that our algorithm provides solutions that are within 0.3 dB of the optimal solutions while satisfying real-time requirements of multicast systems. In addition, our algorithm results in an average power consumption reduction of 86\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shi:2012:RTR, author = "Shu Shi and Klara Nahrstedt and Roy Campbell", title = "A real-time remote rendering system for interactive mobile graphics", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "46:1--46:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mobile devices are gradually changing people's computing behaviors. However, due to the limitations of physical size and power consumption, they are not capable of delivering a 3D graphics rendering experience comparable to desktops. Many applications with intensive graphics rendering workloads are unable to run on mobile platforms directly. This issue can be addressed with the idea of remote rendering: the heavy 3D graphics rendering computation runs on a powerful server and the rendering results are transmitted to the mobile client for display. However, the simple remote rendering solution inevitably suffers from the large interaction latency caused by wireless networks, and is not acceptable for many applications that have very strict latency requirements. In this article, we present an advanced low-latency remote rendering system that assists mobile devices to render interactive 3D graphics in real-time. Our design takes advantage of an image based rendering technique: 3D image warping, to synthesize the mobile display from the depth images generated on the server. 
The research indicates that the system can successfully reduce the interaction latency while maintaining the high rendering quality by generating multiple depth images at the carefully selected viewpoints. We study the problem of viewpoint selection, propose a real-time reference viewpoint prediction algorithm, and evaluate the algorithm performance with real-device experiments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Guan:2012:EMM, author = "Wei Guan and Suya You and Ulrich Neumann", title = "Efficient matchings and mobile augmented reality", journal = j-TOMCCAP, volume = "8", number = "3s", pages = "47:1--47:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348816.2348826", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Nov 6 18:13:07 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the fast-growing popularity of smart phones in recent years, augmented reality (AR) on mobile devices is gaining more attention and becomes more demanding than ever before. However, the limited processors in mobile devices are not quite promising for AR applications that require real-time processing speed. The challenge exists due to the fact that, while fast features are usually not robust enough in matchings, robust features like SIFT or SURF are not computationally efficient. There is always a tradeoff between robustness and efficiency and it seems that we have to sacrifice one for the other. While this is true for most existing features, researchers have been working on designing new features with both robustness and efficiency.
In this article, we are not trying to present a completely new feature. Instead, we propose an efficient matching method for robust features. An adaptive scoring scheme and a more distinctive descriptor are also proposed for performance improvements. Besides, we have developed an outdoor augmented reality system that is based on our proposed methods. The system demonstrates that not only it can achieve robust matchings efficiently, it is also capable to handle large occlusions such as passengers and moving vehicles, which is another challenge for many AR applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{TOMCCAP-STAFF:2012:TCO, author = "{TOMCCAP-STAFF}", title = "Table of contents: Online supplement volume 8, number 2s, online supplement volume 8, number 3s", journal = j-TOMCCAP, volume = "8", number = "4", pages = "48:1--48:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2382432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2012:E, author = "Ralf Steinmetz", title = "Editorial", journal = j-TOMCCAP, volume = "8", number = "4", pages = "49:1--49:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379791", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2012:LRC, author = "Xiaobai Liu and Shuicheng Yan and Bin Cheng and Jinhui Tang and Tat-Seng Chua and Hai Jin", title = "Label-to-region with continuity-biased bi-layer sparsity priors", journal = j-TOMCCAP, volume = "8", number = "4", pages = "50:1--50:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379792", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this work, we investigate how to reassign the fully annotated labels at image level to those contextually derived semantic regions, namely Label-to-Region (L2R), in a collective manner. Given a set of input images with label annotations, the basic idea of our approach to L2R is to first discover the patch correspondence across images, and then propagate the common labels shared in image pairs to these correlated patches. Specially, our approach consists of following aspects.
First, each of the input images is encoded as a Bag-of-Hierarchical-Patch (BOP) for capturing the rich cues at variant scales, and the individual patches are expressed by patch-level feature descriptors. Second, we present a sparse representation formulation for discovering how well an image or a semantic region can be robustly reconstructed by all the other image patches from the input image set. The underlying philosophy of our formulation is that an image region can be sparsely reconstructed with the image patches belonging to the other images with common labels, while the robustness in label propagation across images requires that these selected patches come from very few images. This preference of being sparse at both patch and image level is named bi-layer sparsity prior. Meanwhile, we enforce the preference of choosing larger-size patches in reconstruction, referred to as continuity-biased prior in this work, which may further enhance the reliability of L2R assignment. Finally, we harness the reconstruction coefficients to propagate the image labels to the matched patches, and fuse the propagation results over all patches to finalize the L2R task. As a by-product, the proposed continuity-biased bi-layer sparse representation formulation can be naturally applied to perform image annotation on new testing images. Extensive experiments on three public image datasets clearly demonstrate the effectiveness of our proposed framework in both L2R assignment and image annotation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rooij:2012:ETS, author = "Ork De Rooij and Marcel Worring", title = "Efficient targeted search using a focus and context video browser", journal = j-TOMCCAP, volume = "8", number = "4", pages = "51:1--51:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379793", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Currently there are several interactive content-based video retrieval techniques and systems available. However, retrieval performance depends heavily on the means of interaction. We argue that effective CBVR requires efficient, specialized user interfaces. In this article we propose guidelines for such an interface, and we propose an effective CBVR engine: the ForkBrowser, which builds upon the principle of focus and context. This browser is evaluated using a combination of user simulation and real user evaluation. Results indicate that the ideas have merit, and that the browser performs very well when compared to the state-of-the-art in video retrieval.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghinea:2012:UPM, author = "Gheorghita Ghinea and Oluwakemi Ademoye", title = "User perception of media content association in olfaction-enhanced multimedia", journal = j-TOMCCAP, volume = "8", number = "4", pages = "52:1--52:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379794", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Olfaction is an exciting challenge facing multimedia applications. In this article we have investigated user perception of the association between olfactory media content and video media content in olfactory-enhanced multimedia. Results show that the association between scent and content has a significant impact on the user-perceived experience of olfactory-enhanced multimedia.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Spicer:2012:NAD, author = "Ryan Spicer and Yu-Ru Lin and Aisling Kelliher and Hari Sundaram", title = "{NextSlidePlease}: Authoring and delivering agile multimedia presentations", journal = j-TOMCCAP, volume = "8", number = "4", pages = "53:1--53:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379795", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Presentation support tools, such as Microsoft PowerPoint, pose challenges both in terms of creating linear presentations from complex data and fluidly navigating such linear structures when presenting to diverse audiences. NextSlidePlease is a slideware application that addresses these challenges using a directed graph structure approach for authoring and delivering multimedia presentations. The application combines novel approaches for searching and analyzing presentation datasets, composing meaningfully structured presentations, and efficiently delivering material under a variety of time constraints. We introduce and evaluate a presentation analysis algorithm intended to simplify the process of authoring dynamic presentations, and a time management and path selection algorithm that assists users in prioritizing content during the presentation process. Results from two comparative user studies indicate that the directed graph approach promotes the creation of hyperlinks, the consideration of connections between content items, and a richer understanding of the time management consequences of including and selecting presentation material.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Qi:2012:OBI, author = "Heng Qi and Keqiu Li and Yanming Shen and Wenyu Qu", title = "Object-based image retrieval with kernel on adjacency matrix and local combined features", journal = j-TOMCCAP, volume = "8", number = "4", pages = "54:1--54:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379796", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In object-based image retrieval, there are two important issues: an effective image representation method for representing image content and an effective image classification method for processing user feedback to find more images containing the user-desired object categories. In the image representation method, the local-based representation is the best selection for object-based image retrieval. As a kernel-based classification method, Support Vector Machine (SVM) has shown impressive performance on image classification. But SVM cannot work on the local-based representation unless there is an appropriate kernel. To address this problem, some representative kernels are proposed in literatures. However, these kernels cannot work effectively in object-based image retrieval due to ignoring the spatial context and the combination of local features. In this article, we present Adjacent Matrix (AM) and the Local Combined Features (LCF) to incorporate the spatial context and the combination of local features into the kernel. We propose the AM-LCF feature vector to represent image content and the AM-LCF kernel to measure the similarities between AM-LCF feature vectors. 
According to the detailed analysis, we show that the proposed kernel can overcome the deficiencies of existing kernels. Moreover, we evaluate the proposed kernel through experiments of object-based image retrieval on two public image sets. The experimental results show that the performance of object-based image retrieval can be improved by the proposed kernel.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2012:VPA, author = "Guangda Li and Meng Wang and Zheng Lu and Richang Hong and Tat-Seng Chua", title = "In-video product annotation with {Web} information mining", journal = j-TOMCCAP, volume = "8", number = "4", pages = "55:1--55:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379797", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Product annotation in videos is of great importance for video browsing, search, and advertisement. However, most of the existing automatic video annotation research focuses on the annotation of high-level concepts, such as events, scenes, and object categories. This article presents a novel solution to the annotation of specific products in videos by mining information from the Web. It collects a set of high-quality training data for each product by simultaneously leveraging Amazon and Google image search engine. A visual signature for each product is then built based on the bag-of-visual-words representation of the training images. A correlative sparsification approach is employed to remove noisy bins in the visual signatures. These signatures are used to annotate video frames. 
We conduct experiments on more than 1,000 videos and the results demonstrate the feasibility and effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gopinathan:2012:ASO, author = "Ajay Gopinathan and Zongpeng Li", title = "Algorithms for stochastic optimization of multicast content delivery with network coding", journal = j-TOMCCAP, volume = "8", number = "4", pages = "56:1--56:??", month = nov, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2379790.2379798", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:21 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The usage of network resources by content providers is commonly governed by Service-Level Agreements (SLA) between the content provider and the network service provider. Resource usage exceeding the limits specified in the SLA incurs the content provider additional charges, usually at a higher cost. Hence, the content provider's goal is to provision adequate resources in the SLA based on forecasts of future demand. We study capacity purchasing strategies when the content provider employs network coded multicast as the media delivery mechanism, with uncertainty in its future customer set explicitly taken into consideration. The latter requires the content provider to make capacity provisioning decisions based on market predictions and historical customer usage patterns. The probabilistic element suggests a stochastic optimization approach. We model this problem as a two-stage stochastic optimization problem with recourse. 
Such optimizations are \#P-hard to solve directly, and we design two approximation algorithms for them. The first is a heuristic algorithm that exploits properties unique to network coding, so that only polynomial-time operations are needed. It performs well in general scenarios, but the gap from the optimal solution is not bounded by any constant in the worst case. This motivates our second approach, a sampling algorithm partly inspired from the work of Gupta et al. [2004a]. We employ techniques from duality theory in linear optimization to prove that the sampling algorithm provides a 3-approximation to the stochastic multicast problem. We conduct extensive simulations to illustrate the efficacy of both algorithms, and show that the performance of both is usually within 10\% of the optimal solution in practice.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hendrikx:2013:PCG, author = "Mark Hendrikx and Sebastiaan Meijer and Joeri {Van Der Velden} and Alexandru Iosup", title = "Procedural content generation for games: a survey", journal = j-TOMCCAP, volume = "9", number = "1", pages = "1:1--1:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422957", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hundreds of millions of people play computer games every day. For them, game content-from 3D objects to abstract puzzles-plays a major entertainment role. 
Manual labor has so far ensured that the quality and quantity of game content matched the demands of the playing community, but is facing new scalability challenges due to the exponential growth over the last decade of both the gamer population and the production costs. Procedural Content Generation for Games (PCG-G) may address these challenges by automating, or aiding in, game content generation. PCG-G is difficult, since the generator has to create the content, satisfy constraints imposed by the artist, and return interesting instances for gamers. Despite a large body of research focusing on PCG-G, particularly over the past decade, ours is the first comprehensive survey of the field of PCG-G. We first introduce a comprehensive, six-layered taxonomy of game content: bits, space, systems, scenarios, design, and derived. Second, we survey the methods used across the whole field of PCG-G from a large research body. Third, we map PCG-G methods to game content layers; it turns out that many of the methods used to generate game content from one layer can be used to generate content from another. We also survey the use of methods in practice, that is, in commercial or prototype games. Fourth and last, we discuss several directions for future research in PCG-G, which we believe deserve close attention in the near future.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2013:IRQ, author = "Dong Liu and Shuicheng Yan and Rong-Rong Ji and Xian-Sheng Hua and Hong-Jiang Zhang", title = "Image retrieval with query-adaptive hashing", journal = j-TOMCCAP, volume = "9", number = "1", pages = "2:1--2:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422958", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hashing-based approximate nearest-neighbor search may well realize scalable content-based image retrieval. The existing semantic-preserving hashing methods leverage the labeled data to learn a fixed set of semantic-aware hash functions. However, a fixed hash function set is unable to well encode all semantic information simultaneously, and ignores the specific user's search intention conveyed by the query. In this article, we propose a query-adaptive hashing method which is able to generate the most appropriate binary codes for different queries. Specifically, a set of semantic-biased discriminant projection matrices are first learnt for each of the semantic concepts, through which a semantic-adaptable hash function set is learnt via a joint sparsity variable selection model. At query time, we further use the sparsity representation procedure to select the most appropriate hash function subset that is informative to the semantic information conveyed by the query. Extensive experiments over three benchmark image datasets well demonstrate the superiority of our proposed query-adaptive hashing method over the state-of-the-art ones in terms of retrieval accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zheng:2013:GSD, author = "Yan-Tao Zheng and Shuicheng Yan and Zheng-Jun Zha and Yiqun Li and Xiangdong Zhou and Tat-Seng Chua and Ramesh Jain", title = "{GPSView}: a scenic driving route planner", journal = j-TOMCCAP, volume = "9", number = "1", pages = "3:1--3:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422959", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "GPS devices have been widely used in automobiles to compute navigation routes to destinations. The generated driving route targets the minimal traveling distance, but neglects the sightseeing experience of the route. In this study, we propose an augmented GPS navigation system, GPSView, to incorporate a scenic factor into the routing. The goal of GPSView is to plan a driving route with scenery and sightseeing qualities, and therefore allow travelers to enjoy sightseeing on the drive. To do so, we first build a database of scenic roadways with vistas of landscapes and sights along the roadside. Specifically, we adapt an attention-based approach to exploit community-contributed GPS-tagged photos on the Internet to discover scenic roadways. The premise is: a multitude of photos taken along a roadway imply that this roadway is probably appealing and catches the public's attention. By analyzing the geospatial distribution of photos, the proposed approach discovers the roadside sight spots, or Points-Of-Interest (POIs), which have good scenic qualities and visibility to travelers on the roadway. 
Finally, we formulate scenic driving route planning as an optimization task towards the best trade-off between sightseeing experience and traveling distance. Testing in the northern California area shows that the proposed system can deliver promising results.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhou:2013:SMV, author = "Wengang Zhou and Houqiang Li and Yijuan Lu and Qi Tian", title = "{SIFT} match verification by geometric coding for large-scale partial-duplicate web image search", journal = j-TOMCCAP, volume = "9", number = "1", pages = "4:1--4:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422960", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Most large-scale image retrieval systems are based on the bag-of-visual-words model. However, the traditional bag-of-visual-words model does not capture the geometric context among local features in images well, which plays an important role in image retrieval. In order to fully explore geometric context of all visual words in images, efficient global geometric verification methods have been attracting lots of attention. Unfortunately, current existing methods on global geometric verification are either computationally expensive to ensure real-time response, or cannot handle rotation well. To solve the preceding problems, in this article, we propose a novel geometric coding algorithm, to encode the spatial context among local features for large-scale partial-duplicate Web image retrieval. 
Our geometric coding consists of geometric square coding and geometric fan coding, which describe the spatial relationships of SIFT features into three geo-maps for global verification to remove geometrically inconsistent SIFT matches. Our approach is not only computationally efficient, but also effective in detecting partial-duplicate images with rotation, scale changes, partial-occlusion, and background clutter. Experiments in partial-duplicate Web image search, using two datasets with one million Web images as distractors, reveal that our approach outperforms the baseline bag-of-visual-words approach even following a RANSAC verification in mean average precision. Besides, our approach achieves comparable performance to other state-of-the-art global geometric verification methods, for example, spatial coding scheme, but is more computationally efficient.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Park:2013:ISL, author = "Jong-Seung Park and Ramesh Jain", title = "Identification of scene locations from geotagged images", journal = j-TOMCCAP, volume = "9", number = "1", pages = "5:1--5:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422961", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Due to geotagging capabilities of consumer cameras, it has become easy to capture the exact geometric location where a picture is taken. However, the location is not the whereabouts of the scene taken by the photographer but the whereabouts of the photographer himself. 
To determine the actual location of an object seen in a photo some sophisticated and tiresome steps are required on a special camera rig, which are generally not available in common digital cameras. This article proposes a novel method to determine the geometric location corresponding to a specific image pixel. A new technique of stereo triangulation is introduced to compute the relative depth of a pixel position. Geographical metadata embedded in images are utilized to convert relative depths to absolute coordinates. When a geographic database is available we can also infer the semantically meaningful description of a scene object from where the specified pixel is projected onto the photo. Experimental results demonstrate the effectiveness of the proposed approach in accurately identifying actual locations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2013:RAA, author = "Yichuan Wang and Ting-An Lin and Cheng-Hsin Hsu and Xin Liu", title = "Region- and action-aware virtual world clients", journal = j-TOMCCAP, volume = "9", number = "1", pages = "6:1--6:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422962", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We propose region- and action-aware virtual world clients. To develop such clients, we present a parameterized network traffic model, based on a large collection of Second Life traces gathered by us. Our methodology is also applicable to virtual worlds other than Second Life. 
With the traffic model, various optimization criteria can be adopted, including visual quality, response time, and energy consumption. We use energy consumption as the show case, and demonstrate via trace-driven simulations that, compared to two existing schemes, a mobile client can save up to 36\% and 41\% communication energy by selectively turning on its WiFi network interface.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Khodabakhshi:2013:SSF, author = "Naghmeh Khodabakhshi and Mohamed Hefeeda", title = "{Spider}: a system for finding {$3$D} video copies", journal = j-TOMCCAP, volume = "9", number = "1", pages = "7:1--7:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422963", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel content-based copy detection system for 3D videos. The system creates compact and robust depth and visual signatures from the 3D videos. Then, signature of a query video is compared against an indexed database of reference videos' signatures. The system returns a score, using both spatial and temporal characteristics of videos, indicating whether the query video matches any video in the reference video database, and in case of matching, which portion of the reference video matches the query video. Analysis shows that the system is efficient, both computationally and storage-wise. The system can be used, for example, by video content owners, video hosting sites, and third-party companies to find illegally copied 3D videos. 
We implemented Spider, a complete realization of the proposed system, and conducted rigorous experiments on it. Our experimental results show that the proposed system can achieve high accuracy in terms of precision and recall even if the 3D videos are subjected to several transformations at the same time. For example, the proposed system yields 100\% precision and recall when copied videos are parts of original videos, and more than 90\% precision and recall when copied videos are subjected to different individual transformations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Abrams:2013:WAG, author = "Austin Abrams and Robert Pless", title = "{Web}-accessible geographic integration and calibration of webcams", journal = j-TOMCCAP, volume = "9", number = "1", pages = "8:1--8:??", month = feb, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2422956.2422964", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun May 5 09:14:22 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A global network of webcams offers unique viewpoints from tens of thousands of locations. Understanding the geographic context of this imagery is vital in using these cameras for quantitative environmental monitoring or surveillance applications. We derive robust geo-calibration constraints that allow users to geo-register static or pan-tilt-zoom cameras by specifying a few corresponding points, and describe our Web interface suitable for novices. We discuss design decisions that support our scalable, publicly accessible Web service that allows webcam textures to be displayed live on 3D geographic models. 
Finally, we demonstrate several multimedia applications for geo-calibrated cameras.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2013:EN, author = "Ralf Steinmetz", title = "Editorial note", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "31:1--31:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523001.2523002", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Nahrstedt:2013:ISS, author = "Klara Nahrstedt and Rainer Lienhart and Malcolm Slaney", title = "Introduction to the special section on the 20th anniversary of the {ACM International Conference on Multimedia}", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "32:1--32:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523001.2523003", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2013:TDI, author = "Baochun Li and Zhi Wang and Jiangchuan Liu and Wenwu Zhu", title = "Two decades of {Internet} video streaming: a retrospective view", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "33:1--33:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2505805", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For over two decades, video streaming over the Internet has received a substantial amount of attention from both academia and industry. Starting from the design of transport protocols for streaming video, research interests have later shifted to the peer-to-peer paradigm of designing streaming protocols at the application layer. More recent research has focused on building more practical and scalable systems, using Dynamic Adaptive Streaming over HTTP. In this article, we provide a retrospective view of the research results over the past two decades, with a focus on peer-to-peer streaming protocols and the effects of cloud computing and social media.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2013:ETM, author = "Zixia Huang and Klara Nahrstedt and Ralf Steinmetz", title = "Evolution of temporal multimedia synchronization principles: a historical viewpoint", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "34:1--34:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490821", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The evolution of multimedia applications has drastically changed human life and behaviors. New communication technologies lead to new requirements for multimedia synchronization. This article presents a historical view of temporal synchronization studies focusing on continuous multimedia. We demonstrate how the development of multimedia systems has created new challenges for synchronization technologies. We conclude with a new application-dependent, multilocation, multirequirement synchronization framework to address these new challenges.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bulterman:2013:SAM, author = "Dick C. A. 
Bulterman and Pablo Cesar and Rodrigo Laiola Guimar{\~a}es", title = "Socially-aware multimedia authoring: {Past}, present, and future", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "35:1--35:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2491893", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Creating compelling multimedia productions is a nontrivial task. This is as true for creating professional content as it is for nonprofessional editors. During the past 20 years, authoring networked content has been a part of the research agenda of the multimedia community. Unfortunately, authoring has been seen as an initial enterprise that occurs before `real' content processing takes place. This limits the options open to authors and to viewers of rich multimedia content for creating and receiving focused, highly personal media presentations. This article reflects on the history of multimedia authoring. We focus on the particular task of supporting socially-aware multimedia, in which the relationships within particular social groups among authors and viewers can be exploited to create highly personal media experiences. We provide an overview of the requirements and characteristics of socially-aware multimedia authoring within the context of exploiting community content. We continue with a short historical perspective on authoring support for these types of situations. We then present an overview of a current system for supporting socially-aware multimedia authoring within the community content. We conclude with a discussion of the issues that we feel can provide a fruitful basis for future multimedia authoring support. 
We argue that providing support for socially-aware multimedia authoring can have a profound impact on the nature and architecture of the entire multimedia information processing pipeline.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2013:IST, author = "Lei Zhang and Yong Rui", title = "Image search-from thousands to billions in 20 years", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "36:1--36:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490823", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a comprehensive review and analysis on image search in the past 20 years, emphasizing the challenges and opportunities brought by the astonishing increase of dataset scales from thousands to billions in the same time period, which was witnessed first-hand by the authors as active participants in this research area. Starting with a retrospective review of three stages of image search in the history, the article highlights major breakthroughs around the year 2000 in image search features, indexing methods, and commercial systems, which marked the transition from stage two to stage three. Subsequent sections describe the image search research from four important aspects: system framework, feature extraction and image representation, indexing, and big data's potential. 
Based on the review, the concluding section discusses open research challenges and suggests future research directions in effective visual representation, image knowledge base construction, implicit user feedback and crowdsourcing, mobile image search, and creative multimedia interfaces.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rowe:2013:LFY, author = "Lawrence A. Rowe", title = "Looking forward 10 years to multimedia successes", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "37:1--37:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A panel at ACM Multimedia 2012 addressed research successes in the past 20 years. While the panel focused on the past, this article discusses successes since the ACM SIGMM 2003 Retreat and suggests research directions in the next ten years. While significant progress has been made, more research is required to allow multimedia to impact our everyday computing environment. The importance of hardware changes on future research directions is discussed. We believe ubiquitous computing-meaning abundant computation and network bandwidth-should be applied in novel ways to solve multimedia grand challenges and continue the IT revolution of the past century.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shenoy:2013:MSR, author = "Prashant Shenoy", title = "Multimedia systems research: {The} first twenty years and lessons for the next twenty", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "38:1--38:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490859", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This retrospective article examines the past two decades of multimedia systems research through the lens of three research topics that were in vogue in the early days of the field and offers perspectives on the evolution of these research topics. We discuss the eventual impact of each line of research and offer lessons for future research in the field.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hua:2013:OVD, author = "Kien A. Hua", title = "Online video delivery: {Past}, present, and future", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "39:1--39:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502435", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video streaming is the core technology for online video delivery systems. Initial research on this technology faced many challenges. 
In this article, lessons learned from beginning trials are discussed; some pioneering works that provided early solutions and inspired subsequent research are presented; and new techniques required for emerging applications are examined.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Swaminathan:2013:WMV, author = "Viswanathan Swaminathan", title = "Are we in the middle of a video streaming revolution?", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "40:1--40:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490826", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "It has been roughly 20 years since the beginning of video streaming over the Internet. Until very recently, video streaming experiences left much to be desired. Over the last few years, this has significantly improved, making monetization of streaming possible. Recently, there has been an explosion of commercial video delivery services over the Internet, sometimes referred to as over-the-top (OTT) delivery. All these services invariably use streaming technologies. Initially, streaming had all the promise, then for a long time, it was download and play, later progressive download for short content, and now it is streaming again. Did streaming win the download versus streaming contest? Did the best technology win? The improvement in streaming experience has been possible through a variety of new streaming technologies, some proprietary and others extensions to standard protocols. 
The primary delivery mechanism for entertainment video, both premium content like movies and user generated content (UGC), tends to be HTTP streaming. Is HTTP streaming the panacea for all problems? The goal of this article is to give an industry perspective of what fundamentally changed in video streaming that makes it commercially viable now. This article outlines how a blend of technology choices between download and streaming makes the current wave of ubiquitous streaming possible for entertainment video delivery. After identifying problems that still need to be solved, the article concludes with the lessons learnt from the video streaming evolution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chou:2013:AIC, author = "Philip A. Chou", title = "Advances in immersive communication: (1) {Telephone}, (2) {Television}, (3) {Teleportation}", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "41:1--41:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492704", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The last great advances in immersive communication were the invention of the telephone over 137 years ago and the invention of the video telephone (n{\'e} television) over 86 years ago. However, a perfect storm is brewing for the next advance in immersive communication, thanks to the convergence of massive amounts of computation, bandwidth, resolution, new sensors, and new displays. 
It could well be the Multimedia community that turns this brew into the next great advance in immersive communication, something akin to teleportation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chang:2013:HFW, author = "Shih-Fu Chang", title = "How far we've come: {Impact} of 20 years of multimedia information retrieval", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "42:1--42:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2491844", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article reviews the major research trends that emerged in the last two decades within the broad area of multimedia information retrieval, with a focus on the ACM Multimedia community. Trends are defined (nonscientifically) to be topics that appeared in ACM multimedia publications and have had a significant number of citations. The article also assesses the impacts of these trends on real-world applications. The views expressed are subjective and likely biased but hopefully useful for understanding the heritage of the community and stimulating new research direction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Effelsberg:2013:PLB, author = "Wolfgang Effelsberg", title = "A personal look back at twenty years of research in multimedia content analysis", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "43:1--43:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502434", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This paper is a personal look back at twenty years of research in multimedia content analysis. It addresses the areas of audio, photo and video analysis for the purpose of indexing and retrieval from the perspective of a multimedia researcher. Whereas a general analysis of content is impossible due to the personal bias of the user, significant progress was made in the recognition of specific objects or events. The paper concludes with a brief outlook on the future.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hanjalic:2013:MRM, author = "Alan Hanjalic", title = "Multimedia retrieval that matters", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "44:1--44:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490827", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article emphasizes the need to refocus multimedia information retrieval (MIR) research towards bridging the utility gap, the gap between the expected and de facto usefulness of MIR solutions. This requires us to revisit the notion of relevance, but also to consider other criteria for assessing MIR solutions, like the informativeness of the retrieved results and how helpful they are for the users. The article also states that this focus shift cannot be realized incrementally, but by revisiting the foundations of MIR solutions, that is, by a utility-by-design approach. In this respect, a number of research challenges are proposed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Turk:2013:TYE, author = "Matthew Turk", title = "Over twenty years of eigenfaces", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "45:1--45:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2490824", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The inaugural ACM Multimedia Conference coincided with a surge of interest in computer vision technologies for detecting and recognizing people and their activities in images and video. Face recognition was the first of these topics to broadly engage the vision and multimedia research communities. The Eigenfaces approach was, deservedly or not, the method that captured much of the initial attention, and it continues to be taught and used as a benchmark over 20 years later. This article is a brief personal view of the genesis of Eigenfaces for face recognition and its relevance to the multimedia community.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Whitman:2013:CSF, author = "Brian Whitman", title = "Care and scale: {Fifteen} years of music retrieval", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "46:1--46:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492703", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The co-founder of The Echo Nest, a music intelligence company that now powers recommendation and discovery for most music services, discusses the notion of care and scale, cultural analysis of music, a brief history of music retrieval, and how and why The Echo Nest got started.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Szeliski:2013:NWC, author = "Richard Szeliski and Noah Snavely and Steven M. Seitz", title = "Navigating the worldwide community of photos", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "47:1--47:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2492208", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The last decade has seen an explosion in the number of photographs available on the Internet. The sheer volume of interesting photos makes it a challenge to explore this space. 
Various Web and social media sites, along with search and indexing techniques, have been developed in response. One natural way to navigate these images is in a 3D geo-located context. In this article, we reflect on our work in this area, with a focus on techniques that build partial 3D scene models to help find and navigate interesting photographs in an interactive, immersive 3D setting. We also discuss how finding such relationships among photographs opens up exciting new possibilities for multimedia authoring, visualization, and editing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Andre:2013:EUU, author = "Elisabeth Andr{\'e}", title = "Exploiting unconscious user signals in multimodal human-computer interaction", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "48:1--48:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents the idea of empathic stimulation that relies on the power and potential of unconsciously conveyed attentive and emotional information to facilitate human-machine interaction. Starting from a historical review of related work presented at past ACM Multimedia conferences, we discuss challenges that arise when exploiting unconscious human signals for empathic stimulation, such as the real-time analysis of psychological user states and the smooth adaptation of the human-machine interface based on this analysis. 
A classical application field that might benefit from the idea of unconscious human-computer interaction is the exploration of massive datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sundaram:2013:EMS, author = "Hari Sundaram", title = "Experiential media systems", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "49:1--49:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a personalized narrative on the early discussions within the Multimedia community and the subsequent research on experiential media systems. I discuss two different research initiatives-design of real-time, immersive multimedia feedback environments for stroke rehabilitation; exploratory environments for events that exploited the user's ability to make connections. I discuss the issue of foundations: the question of multisensory integration and superadditivity; the need for identification of ``first-class'' Multimedia problems; expanding the scope of Multimedia research.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kompatsiaris:2013:ISS, author = "Ioannis (Yiannis) Kompatsiaris and Wenjun (Kevin) Zeng and Gang Hua and Liangliang Cao", title = "Introduction to the special section of best papers of {ACM} multimedia 2012", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "50:1--50:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523001.2523004", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2013:RAM, author = "Heng Liu and Tao Mei and Houqiang Li and Jiebo Luo and Shipeng Li", title = "Robust and accurate mobile visual localization and its applications", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "51:1--51:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2491735", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mobile applications are becoming increasingly popular. More and more people are using their phones to enjoy ubiquitous location-based services (LBS). The increasing popularity of LBS creates a fundamental problem: mobile localization. 
Besides traditional localization methods that use GPS or wireless signals, using phone-captured images for localization has drawn significant interest from researchers. Photos contain more scene context information than the embedded sensors, leading to a more precise location description. With the goal being to accurately sense real geographic scene contexts, this article presents a novel approach to mobile visual localization according to a given image (typically associated with a rough GPS position). The proposed approach is capable of providing a complete set of more accurate parameters about the scene geo-context including the real locations of both the mobile user and perhaps more importantly the captured scene, as well as the viewing direction. To figure out how to make image localization quick and accurate, we investigate various techniques for large-scale image retrieval and 2D-to-3D matching. Specifically, we first generate scene clusters using joint geo-visual clustering, with each scene being represented by a reconstructed 3D model from a set of images. The 3D models are then indexed using a visual vocabulary tree structure. Taking geo-tags of the database image as prior knowledge, a novel location-based codebook weighting scheme is proposed to embed this additional information into the codebook. The discriminative power of the codebook is enhanced, thus leading to better image retrieval performance. The query image is aligned with the models obtained from the image retrieval results, and eventually registered to a real-world map. We evaluate the effectiveness of our approach using several large-scale datasets and achieve estimation accuracy of a user's location within 13 meters, viewing direction within 12 degrees, and viewing distance within 26 meters. Of particular note is our showcase of three novel applications based on localization results: (1) an on-the-spot tour guide, (2) collaborative routing, and (3) a sight-seeing guide. 
The evaluations through user studies demonstrate that these applications are effective in facilitating the ideal rendezvous for mobile users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2013:PBS, author = "Zhi Wang and Wenwu Zhu and Xiangwen Chen and Lifeng Sun and Jiangchuan Liu and Minghua Chen and Peng Cui and Shiqiang Yang", title = "Propagation-based social-aware multimedia content distribution", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "52:1--52:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523001.2523005", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online social networks have reshaped how multimedia contents are generated, distributed, and consumed on today's Internet. Given the massive number of user-generated contents shared in online social networks, users are moving to directly access these contents in their preferred social network services. It is intriguing to study the service provision of social contents for global users with satisfactory quality of experience. In this article, we conduct large-scale measurement of a real-world online social network system to study the social content propagation. We have observed important propagation patterns, including social locality, geographical locality, and temporal locality. Motivated by the measurement insights, we propose a propagation-based social-aware delivery framework using a hybrid edge-cloud and peer-assisted architecture. 
We also design replication strategies for the architecture based on three propagation predictors designed by jointly considering user, content, and context information. In particular, we design a propagation region predictor and a global audience predictor to guide how the edge-cloud servers backup the contents, and a local audience predictor to guide how peers cache the contents for their friends. Our trace-driven experiments further demonstrate the effectiveness and superiority of our design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sang:2013:SIA, author = "Jitao Sang and Changsheng Xu", title = "Social influence analysis and application on multimedia sharing websites", journal = j-TOMCCAP, volume = "9", number = "1s", pages = "53:1--53:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502436", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:45 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Social media is becoming popular these days, where users necessarily interact with each other to form social networks. Influence network, as one special case of social network, has been recognized as significantly impacting social activities and user decisions. We emphasize in this article that the inter-user influence is essentially topic-sensitive, as for different tasks users tend to trust different influencers and be influenced most by them. While existing research focuses on global influence modeling and applies to text-based networks, this work investigates the problem of topic-sensitive influence modeling in the multimedia domain. 
According to temporal data justification, we propose a multimodal probabilistic model, considering both users' textual annotation and uploaded visual images. This model is capable of simultaneously extracting user topic distributions and topic-sensitive influence strengths. By identifying the topic-sensitive influencer, we are able to conduct applications, like collective search and collaborative recommendation. A risk minimization-based general framework for personalized image search is further presented, where the image search task is transferred to measure the distance of image and personalized query language models. The framework considers the noisy tag issue and enables easy incorporation of social influence. We have conducted experiments on a large-scale Flickr dataset. Qualitative as well as quantitative evaluation results have validated the effectiveness of the topic-sensitive influencer mining model, and demonstrated the advantage of incorporating topic-sensitive influence in personalized image search and topic-based image recommendation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Silva:2013:HPH, author = "Juan M. Silva and Mauricio Orozco and Jongeun Cha and Abdulmotaleb {El Saddik} and Emil M. 
Petriu", title = "Human perception of haptic-to-video and haptic-to-audio skew in multimedia applications", journal = j-TOMCCAP, volume = "9", number = "2", pages = "9:1--9:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457451", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The purpose of this research is to assess the sensitivity of humans to perceive asynchrony among media signals coming from a computer application. Particularly we examine haptic-to-video and haptic-to-audio skew. For this purpose we have designed an experimental setup, where users are exposed to a basic multimedia presentation resembling a ping-pong game. For every collision between a ball and a racket, the user is able to perceive auditory, visual, and haptic cues about the collision event. We artificially introduce negative and positive delay to the auditory and visual cues with respect to the haptic stream. We subjectively evaluate the perception of inter-stream asynchrony perceived by the users using two types of haptic devices. The statistical results of our evaluation show perception rates of around 100 ms regardless of modality and type of device.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bhatt:2013:RPB, author = "Chidansh A. Bhatt and Pradeep K. Atrey and Mohan S. 
Kankanhalli", title = "A reward-and-punishment-based approach for concept detection using adaptive ontology rules", journal = j-TOMCCAP, volume = "9", number = "2", pages = "10:1--10:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457452", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Despite the fact that performance improvements have been reported in the last years, semantic concept detection in video remains a challenging problem. Existing concept detection techniques, with ontology rules, exploit the static correlations among primitive concepts but not the dynamic spatiotemporal correlations. The proposed method rewards (or punishes) detected primitive concepts using dynamic spatiotemporal correlations of the given ontology rules and updates these ontology rules based on the accuracy of detection. Adaptively learned ontology rules significantly help in improving the overall accuracy of concept detection as shown in the experimental result.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Alsulaiman:2013:IVB, author = "Fawaz A. Alsulaiman and Nizar Sakr and Julio J. 
Vald{\'e}s and Abdulmotaleb {El Saddik}", title = "Identity verification based on handwritten signatures with haptic information using genetic programming", journal = j-TOMCCAP, volume = "9", number = "2", pages = "11:1--11:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457453", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, haptic-based handwritten signature verification using Genetic Programming (GP) classification is presented. A comparison of GP-based classification with classical classifiers including support vector machine, $k$-nearest neighbors, na{\"\i}ve Bayes, and random forest is conducted. In addition, the use of GP in discovering small knowledge-preserving subsets of features in high-dimensional datasets of haptic-based signatures is investigated and several approaches are explored. Subsets of features extracted from GP-generated models (analytic functions) are also exploited to determine the importance and relevance of different haptic data types (e.g., force, position, torque, and orientation) in user identity verification. The results revealed that GP classifiers compare favorably with the classical methods and use a much fewer number of attributes (with simple function sets).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2013:MAS, author = "Qianni Zhang and Ebroul Izquierdo", title = "Multifeature analysis and semantic context learning for image classification", journal = j-TOMCCAP, volume = "9", number = "2", pages = "12:1--12:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457454", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article introduces an image classification approach in which the semantic context of images and multiple low-level visual features are jointly exploited. The context consists of a set of semantic terms defining the classes to be associated to unclassified images. Initially, a multiobjective optimization technique is used to define a multifeature fusion model for each semantic class. Then, a Bayesian learning procedure is applied to derive a context model representing relationships among semantic classes. Finally, this context model is used to infer object classes within images. Selected results from a comprehensive experimental evaluation are reported to show the effectiveness of the proposed approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2013:MEU, author = "Zhen Wei Zhao and Sameer Samarth and Wei Tsang Ooi", title = "Modeling the effect of user interactions on mesh-based {P2P VoD} streaming systems", journal = j-TOMCCAP, volume = "9", number = "2", pages = "13:1--13:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457455", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "User interactions such as seeks and pauses are widely supported by existing Peer-to-Peer Video-on-Demand (P2P VoD) streaming systems. Their effect on the streaming system, however, has not been well studied. Seeks cause peers to skip part of the video, making them stay in the system for shorter time, and thus contribute less. On the other hand, only part of the video is downloaded due to seeks, reducing peers' demand from the system. It is unclear which factor dominates the effect of seeks on the streaming system. Pauses during playback, on one hand, allow peers to stay longer in the system and upload more content. When interleaved with seeks, however, long pauses may increase peers' demand unnecessarily as peers may download content that will eventually be skipped by subsequent forward seeks. The collective effect of seeks and pauses, together with the known random peer departure, is unintuitive and needs to be addressed properly so as to understand the effect of human factors on the streaming system performance. 
In this article, we develop an analytical model to both qualitatively and quantitatively study the effect of seeks and pauses on mesh-based P2P VoD streaming systems, in particular, the effect on the server cost. Our model can help in understanding how human factors such as seeks and pauses affect the streaming system performance, tuning a P2P VoD system towards better system performance and stability, and providing a framework for capacity planning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2013:ETT, author = "Yang Yang and Yi Yang and Heng Tao Shen", title = "Effective transfer tagging from image to video", journal = j-TOMCCAP, volume = "9", number = "2", pages = "14:1--14:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457456", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent years have witnessed a great explosion of user-generated videos on the Web. In order to achieve an effective and efficient video search, it is critical for modern video search engines to associate videos with semantic keywords automatically. Most of the existing video tagging methods can hardly achieve reliable performance due to deficiency of training data. It is noticed that abundant well-tagged data are available in other relevant types of media (e.g., images). In this article, we propose a novel video tagging framework, termed as Cross-Media Tag Transfer (CMTT), which utilizes the abundance of well-tagged images to facilitate video tagging. 
Specifically, we build a ``cross-media tunnel'' to transfer knowledge from images to videos. To this end, an optimal kernel space, in which distribution distance between images and video is minimized, is found to tackle the domain-shift problem. A novel cross-media video tagging model is proposed to infer tags by exploring the intrinsic local structures of both labeled and unlabeled data, and learn reliable video classifiers. An efficient algorithm is designed to optimize the proposed model in an iterative and alternative way. Extensive experiments illustrate the superiority of our proposal compared to the state-of-the-art algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2013:AAP, author = "Zhen Wei Zhao and Wei Tsang Ooi", title = "{APRICOD}: an access-pattern-driven distributed caching middleware for fast content discovery of noncontinuous media access", journal = j-TOMCCAP, volume = "9", number = "2", pages = "15:1--15:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2457450.2457457", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:48 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Content discovery is a major source of latency in peer-to-peer (P2P) media streaming systems, especially in the presence of noncontinuous user access, such as random seek in Video-on-Demand (VoD) streaming and teleportation in a Networked Virtual Environment (NVE). 
After the aforementioned user interactions, streaming systems often need to initiate the content discovery process to identify where to retrieve the requested media objects. Short content lookup latency is demanded to ensure smooth user experience. Existing content discovery systems based on either a Distributed Hash Table (DHT) or gossip mechanism cannot cope with noncontinuous access efficiently due to their long lookup latency. In this work, we propose an access-pattern-driven distributed caching middleware named APRICOD, which caters for fast and scalable content discovery in peer-to-peer media streaming systems, especially when user interactions are present. APRICOD exploits correlations among media objects accessed by users, and adapts to shift in the user access pattern automatically. We first present a general APRICOD design that can be used with any existing content discovery system. We then present an implementation of APRICOD on top of Pastry, which we use to evaluate APRICOD. Our evaluation in a 1024-node system, using a Second Life trace with 5,735 users and a VoD trace with 54 users, shows that APRICOD can effectively resolve all continuous access queries with a single hop deterministically with node failure as an exception, and resolve noncontinuous access queries with a single hop with high probability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Anonymous:2013:CPM, author = "Anonymous", title = "Call for papers: {Multiple} sensorial {(MulSeMedia)} multi-modal media: {Advances} and applications", journal = j-TOMCCAP, volume = "9", number = "3", pages = "15:1--15:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2500818", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mei:2013:NLS, author = "Tao Mei and Lin-Xie Tang and Jinhui Tang and Xian-Sheng Hua", title = "Near-lossless semantic video summarization and its applications to video analysis", journal = j-TOMCCAP, volume = "9", number = "3", pages = "16:1--16:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487269", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The ever increasing volume of video content on the Web has created profound challenges for developing efficient indexing and search techniques to manage video data. Conventional techniques such as video compression and summarization strive for the two commonly conflicting goals of low storage and high visual and semantic fidelity. 
With the goal of balancing both video compression and summarization, this article presents a novel approach, called Near-Lossless Semantic Summarization (NLSS), to summarize a video stream with the least high-level semantic information loss by using an extremely small piece of metadata. The summary consists of compressed image and audio streams, as well as the metadata for temporal structure and motion information. Although at a very low compression rate (around $ 1 / 40 $ of H.264 baseline, where traditional compression techniques can hardly preserve an acceptable visual fidelity), the proposed NLSS still can be applied to many video-oriented tasks, such as visualization, indexing and browsing, duplicate detection, concept detection, and so on. We evaluate the NLSS on TRECVID and other video collections, and demonstrate that it is a powerful tool for significantly reducing storage consumption, while keeping high-level semantic fidelity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ademoye:2013:IRT, author = "Oluwakemi A. Ademoye and Gheorghita Ghinea", title = "Information recall task impact in olfaction-enhanced multimedia", journal = j-TOMCCAP, volume = "9", number = "3", pages = "17:1--17:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487270", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Enhancing multimedia applications with olfactory sensations is one of the last challenges in the area. 
While there is evidence, both scientific and anecdotal, that olfactory cues help users in information recall tasks, there is a lack of work when the targeted information is one contained in a multimedia presentation, which is precisely the focus of this article. Accordingly, we present the results of two experimental studies. The first study measured the impact of olfactory media variation on the user's ability to perceive, synthesize, and analyze the informational content of olfactory-enhanced multimedia videos; the second study measured the impact of information content, and an information recall task in respect of user perception of the relevance, sense of reality, and acceptability of the olfactory media content, as well as the overall enjoyment of the experience. Results show that the use of olfactory media content, both pleasant and unpleasant, in multimedia displays does not significantly impact on information assimilation in a negative way. Moreover, the addition of a performance task may enhance the user's understanding of the correlation between the characteristic odor(s) and the scenario under consideration, as well as enable users to consciously learn the odors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yeh:2013:CAS, author = "Lo-Yao Yeh and Jiun-Long Huang", title = "A conditional access system with efficient key distribution and revocation for mobile pay-{TV} systems", journal = j-TOMCCAP, volume = "9", number = "3", pages = "18:1--18:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487271", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current mobile pay-TV systems have two types of Conditional Access Systems (CAS): group-key-based and public-key systems. The best feature of group-key-based systems is the ability to enjoy the broadcast nature in delivery multimedia contents, while the major advantage of public-key systems is consolidating the security foundation to withstand various attacks, such as collusion attacks. However, the problems of group-key-based systems include collusion attacks, lack of nonrepudiation, and troublesome key distribution. Even worse, the benefit of broadcast efficiency is confined to a group size of no more than 512 subscribers. For public-key systems, the poor delivery scalability is the major shortcoming because the unique private key feature is only suitable for one-to-one delivery. In this article, we introduce a scalable access control scheme to integrate the merits of broadcasting regardless of group size and sound security assurance, including fine-grained access control and collusion attack resistance. For subscriber revocation, a single message is broadcast to the other subscribers to get the updated key, thus significantly boosting subscriber revocation scalability. 
Due to mobile subscribers' dynamic movements, this article also analyzes the benefit of retransmission cases in our system. Through the performance evaluation and functionality comparison, the proposed scheme should be a decent candidate to enhance the security strength and transmission efficiency in a mobile pay-TV system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Naskar:2013:GTL, author = "Ruchira Naskar and Rajat Subhra Chakraborty", title = "A generalized tamper localization approach for reversible watermarking algorithms", journal = j-TOMCCAP, volume = "9", number = "3", pages = "19:1--19:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487272", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In general reversible watermarking algorithms, the convention is to reject the entire cover image at the receiver end if it fails authentication, since there is no way to detect the exact locations of tampering. This feature may be exploited by an adversary to bring about a form of DoS attack. Here we provide a solution to this problem in form of a tamper localization mechanism for reversible watermarking algorithms, which allows selective rejection of distorted cover image regions in case of authentication failure, thus avoiding rejection of the complete image. Additionally it minimizes the bandwidth requirement of the communication channel.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Doherty:2013:SSA, author = "Jonathan Doherty and Kevin Curran and Paul Mckevitt", title = "A self-similarity approach to repairing large dropouts of streamed music", journal = j-TOMCCAP, volume = "9", number = "3", pages = "20:1--20:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487273", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Enjoyment of audio has now become about flexibility and personal freedom. Digital audio content can be acquired from many sources and wireless networking allows digital media devices and associated peripherals to be unencumbered by wires. However, despite recent improvements in capacity and quality of service, wireless networks are inherently unreliable communications channels for the streaming of audio, being susceptible to the effects of range, interference, and occlusion. This time-varying reliability of wireless audio transfer introduces data corruption and loss, with unpleasant audible effects that can be profound and prolonged in duration. Traditional communications techniques for error mitigation perform poorly and in a bandwidth inefficient manner in the presence of such large-scale defects in a digital audio stream. A novel solution that can complement existing techniques takes account of the semantics and natural repetition of music. Through the use of self-similarity metadata, missing or damaged audio segments can be seamlessly replaced with similar undamaged segments that have already been successfully received. 
We propose a technology to generate relevant self-similarity metadata for arbitrary audio material and to utilize this metadata within a wireless audio receiver to provide sophisticated and real-time correction of large-scale errors. The primary objectives are to match the current section of a song being received with previous sections while identifying incomplete sections and determining replacements based on previously received portions of the song. This article outlines our approach to Forward Error Correction (FEC) technology that is used to ``repair'' a bursty dropout when listening to time-dependent media on a wireless network. Using self-similarity analysis on a music file, we can ``automatically'' repair the dropout with a similar portion of the music already received thereby minimizing a listener's discomfort.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ho:2013:IPC, author = "Edmond S. L. Ho and Jacky C. P. Chan and Taku Komura and Howard Leung", title = "Interactive partner control in close interactions for real-time applications", journal = j-TOMCCAP, volume = "9", number = "3", pages = "21:1--21:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487268.2487274", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:50 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a new framework for synthesizing motion of a virtual character in response to the actions performed by a user-controlled character in real time. 
In particular, the proposed method can handle scenes in which the characters are closely interacting with each other such as those in partner dancing and fighting. In such interactions, coordinating the virtual characters with the human player automatically is extremely difficult because the system has to predict the intention of the player character. In addition, the style variations from different users affect the accuracy in recognizing the movements of the player character when determining the responses of the virtual character. To solve these problems, our framework makes use of the spatial relationship-based representation of the body parts called interaction mesh, which has been proven effective for motion adaptation. The method is computationally efficient, enabling real-time character control for interactive applications. We demonstrate its effectiveness and versatility in synthesizing a wide variety of motions with close interactions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2013:ER, author = "Ralf Steinmetz", title = "Editorial: Reviewers", journal = j-TOMCCAP, volume = "9", number = "4", pages = "22:1--22:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501644", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sakai:2013:PPC, author = "Kazuya Sakai and Wei-Shinn Ku and Min-Te Sun and Roger Zimmermann", title = "Privacy preserving continuous multimedia streaming in {MANETs}", journal = j-TOMCCAP, volume = "9", number = "4", pages = "23:1--23:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501645", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "At present, mobile devices are prevalent with end users and continuous media streaming services in mobile ad-hoc networks (MANETs) support popular applications. It is required for applications that stream isochronous media that the network link be continuously available. In this study, we introduce two group-server scheduling schemes to improve link continuity: static group-server scheduling and dynamic group-server scheduling. With our solution, if one of the current links between a client and a server instance breaks, the client can still download the multimedia content from another scheduled server peer. In addition, we incorporate the data link layer constraints as well as privacy concerns into our protocol design. The simulation results show that the proposed schemes significantly improve the effective link duration, overall system performance, and degree of privacy in MANETs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dong:2013:RIA, author = "Jian Dong and Bin Cheng and Xiangyu Chen and Tat-Seng Chua and Shuicheng Yan and Xi Zhou", title = "Robust image annotation via simultaneous feature and sample outlier pursuit", journal = j-TOMCCAP, volume = "9", number = "4", pages = "24:1--24:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501646", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Graph-based semi-supervised image annotation has achieved great success in a variety of studies, yet it essentially and intuitively suffers from both the irrelevant/noisy features (referred to as feature outliers) and the unusual/corrupted samples (referred to as sample outliers). In this work, we investigate how to derive robust sample affinity matrix via simultaneous feature and sample outlier pursuit. This task is formulated as a Dual-outlier and Prior-driven Low-Rank Representation (DP-LRR) problem, which possesses convexity in objective function. In DP-LRR, the clean data are assumed to be self-reconstructible with low-rank coefficient matrix as in LRR; while the error matrix is decomposed as the sum of a row-wise sparse matrix and a column-wise sparse matrix, the l$_{2,1}$-norm minimization of which encourages the pursuit of feature and sample outliers respectively. The DP-LRR is further regularized by the priors from side information, that is, the inhomogeneous data pairs. An efficient iterative procedure based on linearized alternating direction method is presented to solve the DP-LRR problem, with closed-form solutions within each iteration. 
The derived low-rank reconstruction coefficient matrix is then fed into any graph based semi-supervised label propagation algorithm for image annotation, and as a by-product, the cleaned data from DP-LRR can also be utilized as a better image representation to generally boost image annotation performance. Extensive experiments on MIRFlickr, Corel30K, NUS-WIDE-LITE and NUS-WIDE databases well demonstrate the effectiveness of the proposed formulation for robust image annotation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Villanueva:2013:HMB, author = "Arantxa Villanueva and Victoria Ponz and Laura Sesma-Sanchez and Mikel Ariz and Sonia Porta and Rafael Cabeza", title = "Hybrid method based on topography for robust detection of iris center and eye corners", journal = j-TOMCCAP, volume = "9", number = "4", pages = "25:1--25:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501647", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A multistage procedure to detect eye features is presented. Multiresolution and topographic classification are used to detect the iris center. The eye corner is calculated combining valley detection and eyelid curve extraction. The algorithm is tested in the BioID database and in a proprietary database containing more than 1200 images. The results show that the suggested algorithm is robust and accurate. Regarding the iris center our method obtains the best average behavior for the BioID database compared to other available algorithms. 
Additional contributions are that our algorithm functions in real time and does not require complex post processing stages.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2013:ECR, author = "Bo Wang and Jinqiao Wang and Hanqing Lu", title = "Exploiting content relevance and social relevance for personalized ad recommendation on {Internet TV}", journal = j-TOMCCAP, volume = "9", number = "4", pages = "26:1--26:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501648", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "There have been not many interactions between the two dominant forms of mass communication: television and the Internet, while nowadays the appearance of Internet television makes them more closely. Different with traditional TV in a passive mode of transmission, Internet TV makes it more possible to make personalized service recommendation because of the interactivity between users and the Internet. In this article, we introduce a scheme to provide targeted ad recommendation to Internet TV users by exploiting the content relevance and social relevance. First, we annotate TV videos in terms of visual content analysis and textual analysis by aligning visual and textual information. Second, with user-user, video-video and user-video relationships, we employ Multi-Relationship based Probabilistic Matrix Factorization (MRPMF) to learn representative tags for modeling user preference. 
And then semantic content relevance (between product/ad and TV video) and social relevance (between product/ad and user interest) are calculated by projecting the corresponding tags into our advertising concept space. Finally, with relevancy scores we make ranking for relevant product/ads to effectively provide users personalized recommendation. The experimental results demonstrate attractiveness and effectiveness of our proposed approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Alam:2013:MHB, author = "Kazi Masudul Alam and Abu Saleh Md Mahfujur Rahman and Abdulmotaleb {El Saddik}", title = "Mobile haptic e-book system to support {$3$D} immersive reading in ubiquitous environments", journal = j-TOMCCAP, volume = "9", number = "4", pages = "27:1--27:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501649", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In order to leverage the use of various modalities such as audio-visual materials in instilling effective learning behavior we present an intuitive approach of annotation based hapto-audio-visual interaction with the traditional digital learning materials such as e-books. By integrating the home entertainment system in the user's reading experience combined with haptic interfaces we want to examine whether such augmentation of modalities influence the user's learning patterns. 
The proposed Haptic E--Book (HE-Book) system leverages the haptic jacket, haptic arm band as well as haptic sofa interfaces to receive haptic emotive signals wirelessly in the form of patterned vibrations of the actuators and expresses the learning material by incorporating image, video, 3D environment based augmented display in order to pave ways for intimate reading experience in the popular mobile e-book platform.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Nguyen:2013:TDA, author = "Tam V. Nguyen and Si Liu and Bingbing Ni and Jun Tan and Yong Rui and Shuicheng Yan", title = "Towards decrypting attractiveness via multi-modality cues", journal = j-TOMCCAP, volume = "9", number = "4", pages = "28:1--28:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501650", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Decrypting the secret of beauty or attractiveness has been the pursuit of artists and philosophers for centuries. To date, the computational model for attractiveness estimation has been actively explored in computer vision and multimedia community, yet with the focus mainly on facial features. In this article, we conduct a comprehensive study on female attractiveness conveyed by single/multiple modalities of cues, that is, face, dressing and/or voice, and aim to discover how different modalities individually and collectively affect the human sense of beauty. 
To extensively investigate the problem, we collect the Multi-Modality Beauty (M$^2$B) dataset, which is annotated with attractiveness levels converted from manual $k$-wise ratings and semantic attributes of different modalities. Inspired by the common consensus that middle-level attribute prediction can assist higher-level computer vision tasks, we manually labeled many attributes for each modality. Next, a tri-layer Dual-supervised Feature-Attribute-Task (DFAT) network is proposed to jointly learn the attribute model and attractiveness model of single/multiple modalities. To remedy possible loss of information caused by incomplete manual attributes, we also propose a novel Latent Dual-supervised Feature-Attribute-Task (LDFAT) network, where latent attributes are combined with manual attributes to contribute to the final attractiveness estimation. The extensive experimental evaluations on the collected M$^2$B dataset well demonstrate the effectiveness of the proposed DFAT and LDFAT networks for female attractiveness prediction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tang:2013:TOH, author = "Jinhui Tang and Qiang Chen and Meng Wang and Shuicheng Yan and Tat-Seng Chua and Ramesh Jain", title = "Towards optimizing human labeling for interactive image tagging", journal = j-TOMCCAP, volume = "9", number = "4", pages = "29:1--29:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501651", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Interactive tagging is an approach that combines human and computer to assign descriptive keywords to image contents in a semi-automatic way. It can avoid the problems in automatic tagging and pure manual tagging by achieving a compromise between tagging performance and manual cost. However, conventional research efforts on interactive tagging mainly focus on sample selection and models for tag prediction. In this work, we investigate interactive tagging from a different aspect. We introduce an interactive image tagging framework that can more fully make use of human's labeling efforts. That means, it can achieve a specified tagging performance by taking less manual labeling effort or achieve better tagging performance with a specified labeling cost. In the framework, hashing is used to enable a quick clustering of image regions and a dynamic multiscale clustering labeling strategy is proposed such that users can label a large group of similar regions each time. We also employ a tag refinement method such that several inappropriate tags can be automatically corrected. 
Experiments on a large dataset demonstrate the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Carbunar:2013:FNA, author = "Bogdan Carbunar and Rahul Potharaju and Michael Pearce and Venugopal Vasudevan and Michael Needham", title = "A framework for network aware caching for video on demand systems", journal = j-TOMCCAP, volume = "9", number = "4", pages = "30:1--30:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2501643.2501652", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:51 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", note = "See errata \cite{Carbunar:2014:EFN}.", abstract = "Video on Demand (VoD) services allow users to select and locally consume remotely stored content. We investigate the use of caching to solve the scalability issues of several existing VoD providers. We propose metrics and goals that define the requirements of a caching framework for CDNs of VoD systems. Using data logs collected from Motorola equipment from Comcast VoD deployments we show that several classic caching solutions do not satisfy the proposed goals. We address this issue by developing novel techniques for predicting future values of several metrics of interest. We rely on computed predictions to define the penalty imposed on the system, both network and caching sites, when not storing individual items. We use item penalties to devise novel caching and static content placement strategies. We use the previously mentioned data logs to validate our solutions and show that they satisfy all the defined goals.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2013:ENO, author = "Zechao Li and Jing Liu and Meng Wang and Changsheng Xu and Hanqing Lu", title = "Enhancing news organization for convenient retrieval and browsing", journal = j-TOMCCAP, volume = "10", number = "1", pages = "1:1--1:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2488732", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "To facilitate users to access news quickly and comprehensively, we design a news search and browsing system named GeoVisNews, in which the news elements of ``Where'', ``Who'', ``What'' and ``When'' are enhanced via news geo-localization, image enrichment and joint ranking, respectively. For news geo-localization, an Ordinal Correlation Consistent Matrix Factorization (OCCMF) model is proposed to maintain the relevance rankings of locations to a specific news document and simultaneously capture intra-relations among locations and documents. To visualize news, we develop a novel method to enrich news documents with appropriate web images. Specifically, multiple queries are first generated from news documents for image search, and then the appropriate images are selected from the collected web images by an intelligent fusion approach based on multiple features. Obtaining the geo-localized and image enriched news resources, we further employ a joint ranking strategy to provide relevant, timely and popular news items as the answer of user searching queries. 
Extensive experiments on a large-scale news dataset collected from the web demonstrate the superior performance of the proposed approaches over related methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Knees:2013:SMS, author = "Peter Knees and Markus Schedl", title = "A survey of music similarity and recommendation from music context data", journal = j-TOMCCAP, volume = "10", number = "1", pages = "2:1--2:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2542205.2542206", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this survey article, we give an overview of methods for music similarity estimation and music recommendation based on music context data. Unlike approaches that rely on music content and have been researched for almost two decades, music-context-based (or contextual) approaches to music retrieval are a quite recent field of research within music information retrieval (MIR). Contextual data refers to all music-relevant information that is not included in the audio signal itself. In this article, we focus on contextual aspects of music primarily accessible through web technology. We discuss different sources of context-based data for individual music pieces and for music artists. We summarize various approaches for constructing similarity measures based on the collaborative or cultural knowledge incorporated into these data sources. 
In particular, we identify and review three main types of context-based similarity approaches: text-retrieval-based approaches (relying on web-texts, tags, or lyrics), co-occurrence-based approaches (relying on playlists, page counts, microblogs, or peer-to-peer-networks), and approaches based on user ratings or listening habits. This article elaborates the characteristics of the presented context-based measures and discusses their strengths as well as their weaknesses.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2013:DPO, author = "Yi-Liang Zhao and Qiang Chen and Shuicheng Yan and Tat-Seng Chua and Daqing Zhang", title = "Detecting profilable and overlapping communities with user-generated multimedia contents in {LBSNs}", journal = j-TOMCCAP, volume = "10", number = "1", pages = "3:1--3:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2502415", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In location-based social networks (LBSNs), users implicitly interact with each other by visiting places, issuing comments and/or uploading photos. These heterogeneous interactions convey the latent information for identifying meaningful user groups, namely social communities, which exhibit unique location-oriented characteristics. In this work, we aim to detect and profile social communities in LBSNs by representing the heterogeneous interactions with a multimodality nonuniform hypergraph. 
Here, the vertices of the hypergraph are users, venues, textual comments or photos and the hyperedges characterize the $k$-partite heterogeneous interactions such as posting certain comments or uploading certain photos while visiting certain places. We then view each detected social community as a dense subgraph within the heterogeneous hypergraph, where the user community is constructed by the vertices and edges in the dense subgraph and the profile of the community is characterized by the vertices related with venues, comments and photos and their inter-relations. We present an efficient algorithm to detect the overlapped dense subgraphs, where the profile of each social community is guaranteed to be available by constraining the minimal number of vertices in each modality. Extensive experiments on Foursquare data well validated the effectiveness of the proposed framework in terms of detecting meaningful social communities and uncovering their underlying profiles in LBSNs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bhatnagar:2013:SRI, author = "Gaurav Bhatnagar and Q. M. Jonathan Wu and Pradeep K. 
Atrey", title = "Secure randomized image watermarking based on singular value decomposition", journal = j-TOMCCAP, volume = "10", number = "1", pages = "4:1--4:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2542205.2542207", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, a novel logo watermarking scheme is proposed based on wavelet frame transform, singular value decomposition and automatic thresholding. The proposed scheme essentially rectifies the ambiguity problem in the SVD-based watermarking. The core idea is to randomly upscale the size of host image using reversible random extension transform followed by the embedding of logo watermark in the wavelet frame domain. After embedding, a verification phase is casted with the help of a binary watermark and toral automorphism. At the extraction end, the binary watermark is first extracted followed by the verification of watermarked image. The logo watermark is extracted if and only if the watermarked image is verified. The security, attack and comparative analysis confirm high security, efficiency and robustness of the proposed watermarking system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mou:2013:CBC, author = "Luntian Mou and Tiejun Huang and Yonghong Tian and Menglin Jiang and Wen Gao", title = "Content-based copy detection through multimodal feature representation and temporal pyramid matching", journal = j-TOMCCAP, volume = "10", number = "1", pages = "5:1--5:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2542205.2542208", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Content-based copy detection (CBCD) is drawing increasing attention as an alternative technology to watermarking for video identification and copyright protection. In this article, we present a comprehensive method to detect copies that are subjected to complicated transformations. A multimodal feature representation scheme is designed to exploit the complementarity of audio features, global and local visual features so that optimal overall robustness to a wide range of complicated modifications can be achieved. Meanwhile, a temporal pyramid matching algorithm is proposed to assemble frame-level similarity search results into sequence-level matching results through similarity evaluation over multiple temporal granularities. Additionally, inverted indexing and locality sensitive hashing (LSH) are also adopted to speed up similarity search. Experimental results over benchmarking datasets of TRECVID 2010 and 2009 demonstrate that the proposed method outperforms other methods for most transformations in terms of copy detection accuracy. 
The evaluation results also suggest that our method can achieve competitive copy localization preciseness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2013:LSM, author = "Xiangyu Chen and Yadong Mu and Hairong Liu and Shuicheng Yan and Yong Rui and Tat-Seng Chua", title = "Large-scale multilabel propagation based on efficient sparse graph construction", journal = j-TOMCCAP, volume = "10", number = "1", pages = "6:1--6:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2542205.2542209", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the popularity of photo-sharing websites, the number of web images has exploded into unseen magnitude. Annotating such large-scale data will cost huge amount of human resources and is thus unaffordable. Motivated by this challenging problem, we propose a novel sparse graph based multilabel propagation (SGMP) scheme for super large scale datasets. Both the efficacy and accuracy of the image annotation are further investigated under different graph construction strategies, where Gaussian noise and non-Gaussian sparse noise are simultaneously considered in the formulations of these strategies. Our proposed approach outperforms the state-of-the-art algorithms by focusing on: (1) For large-scale graph construction, a simple yet efficient LSH (Locality Sensitive Hashing)-based sparse graph construction scheme is proposed to speed up the construction. 
We perform the multilabel propagation on this hashing-based graph construction, which is derived with LSH approach followed by sparse graph construction within the individual hashing buckets; (2) To further improve the accuracy, we propose a novel sparsity induced scalable graph construction scheme, which is based on a general sparse optimization framework. Sparsity essentially implies a very strong prior: for large scale optimization, the values of most variables shall be zeros when the solution reaches the optimum. By utilizing this prior, the solutions of large-scale sparse optimization problems can be derived by solving a series of much smaller scale subproblems; (3) For multilabel propagation, different from the traditional algorithms that propagate over individual label independently, our proposed propagation first encodes the label information of an image as a unit label confidence vector and naturally imposes inter-label constraints and manipulates labels interactively. Then, the entire propagation problem is formulated on the concept of Kullback--Leibler divergence defined on probabilistic distributions, which guides the propagation of the supervision information. Extensive experiments on the benchmark dataset NUS-WIDE with 270k images and its lite version NUS-WIDE-LITE with 56k images well demonstrate the effectiveness and scalability of the proposed multi-label propagation scheme.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Houle:2013:API, author = "Michael E. 
Houle and Vincent Oria and Shin'ichi Satoh and Jichao Sun", title = "Annotation propagation in image databases using similarity graphs", journal = j-TOMCCAP, volume = "10", number = "1", pages = "7:1--7:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2487736", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The practicality of large-scale image indexing and querying methods depends crucially upon the availability of semantic information. The manual tagging of images with semantic information is in general very labor intensive, and existing methods for automated image annotation may not always yield accurate results. The aim of this paper is to reduce to a minimum the amount of human intervention required in the semantic annotation of images, while preserving a high degree of accuracy. Ideally, only one copy of each object of interest would be labeled manually, and the labels would then be propagated automatically to all other occurrences of the objects in the database. To this end, we propose an influence propagation strategy, SW-KProp, that requires no human intervention beyond the initial labeling of a subset of the images. SW-KProp distributes semantic information within a similarity graph defined on all images in the database: each image iteratively transmits its current label information to its neighbors, and then readjusts its own label according to the combined influences of its neighbors. SW-KProp influence propagation can be efficiently performed by means of matrix computations, provided that pairwise similarities of images are available. We also propose a variant of SW-KProp which enhances the quality of the similarity graph by selecting a reduced feature set for each prelabeled image and rebuilding its neighborhood. 
The performances of the SW-KProp method and its variant were evaluated against several competing methods on classification tasks for three image datasets: a handwritten digit dataset, a face dataset and a web image dataset. For the digit images, SW-KProp and its variant performed consistently better than the other methods tested. For the face and web images, SW-KProp outperformed its competitors for the case when the number of prelabeled images was relatively small. The performance was seen to improve significantly when the feature selection strategy was applied.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mallik:2013:MOR, author = "Anupama Mallik and Hiranmay Ghosh and Santanu Chaudhury and Gaurav Harit", title = "{MOWL}: an ontology representation language for {Web}-based multimedia applications", journal = j-TOMCCAP, volume = "10", number = "1", pages = "8:1--8:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2542205.2542210", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Several multimedia applications need to reason with concepts and their media properties in specific domain contexts. Media properties of concepts exhibit some unique characteristics that cannot be dealt with conceptual modeling schemes followed in the existing ontology representation and reasoning schemes. We have proposed a new perceptual modeling technique for reasoning with media properties observed in multimedia instances and the latent concepts. 
Our knowledge representation scheme uses a causal model of the world where concepts manifest in media properties with uncertainties. We introduce a probabilistic reasoning scheme for belief propagation across domain concepts through observation of media properties. In order to support the perceptual modeling and reasoning paradigm, we propose a new ontology language, Multimedia Web Ontology Language (MOWL). Our primary contribution in this article is to establish the need for the new ontology language and to introduce the semantics of its novel language constructs. We establish the generality of our approach with two disparate knowledge-intensive applications involving reasoning with media properties of concepts.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Deng:2014:DLB, author = "Yunhua Deng and Rynson W. H. Lau", title = "Dynamic load balancing in distributed virtual environments using heat diffusion", journal = j-TOMCCAP, volume = "10", number = "2", pages = "16:1--16:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2499906", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Distributed virtual environments (DVEs) are attracting a lot of attention in recent years, due to the increasing popularity of online gaming and social networks. As the number of concurrent users of a DVE increases, a critical problem is on how the workload among multiple servers can be balanced in order to maintain real-time performance. 
Although a number of load balancing methods have been proposed, they either try to produce high quality load balancing results and become too slow or emphasize on efficiency and the load balancing results become less effective. In this article, we propose a new approach to address this problem based on heat diffusion. Our work has two main contributions. First, we propose a local and a global load balancing methods for DVEs based on heat diffusion. Second, we investigate two performance factors of the proposed methods, the convergence threshold and the load balancing interval. We have conducted a number of experiments to extensively evaluate the performance of the proposed methods. Our experimental results show that the proposed methods outperform existing methods in that our methods are effective in reducing server overloading while at the same time being efficient.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{She:2014:CID, author = "James She and Jon Crowcroft and Hao Fu and Flora Li", title = "Convergence of interactive displays with smart mobile devices for effective advertising: a survey", journal = j-TOMCCAP, volume = "10", number = "2", pages = "17:1--17:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2557450", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The trend of replacing public static signages with digital displays creates opportunities for interactive display systems, which can be used in collaborative workspaces, social gaming platforms and advertising. 
Based on marketing communication concepts and existing models for consumer behavior, three stages, namely attraction, interaction and conation, are defined in this article to analyze the effectiveness of interactive display advertising. By reviewing various methods and strategies employed by existing systems with attraction, interaction and conation stages, this article concludes that smart mobile devices should be integrated as a component to increase the effectiveness of interactive displays as advertising tools. Future research challenges related to this topic are also discussed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gonina:2014:SMC, author = "Ekaterina Gonina and Gerald Friedland and Eric Battenberg and Penporn Koanantakool and Michael Driscoll and Evangelos Georganas and Kurt Keutzer", title = "Scalable multimedia content analysis on parallel platforms using {Python}", journal = j-TOMCCAP, volume = "10", number = "2", pages = "18:1--18:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2517151", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this new era dominated by consumer-produced media there is a high demand for web-scalable solutions to multimedia content analysis. A compelling approach to making applications scalable is to explicitly map their computation onto parallel platforms. 
However, developing efficient parallel implementations and fully utilizing the available resources remains a challenge due to the increased code complexity, limited portability and required low-level knowledge of the underlying hardware. In this article, we present PyCASP, a Python-based framework that automatically maps computation onto parallel platforms from Python application code to a variety of parallel platforms. PyCASP is designed using a systematic, pattern-oriented approach to offer a single software development environment for multimedia content analysis applications. Using PyCASP, applications can be prototyped in a couple hundred lines of Python code and automatically scale to modern parallel processors. Applications written with PyCASP are portable to a variety of parallel platforms and efficiently scale from a single desktop Graphics Processing Unit (GPU) to an entire cluster with a small change to application code. To illustrate our approach, we present three multimedia content analysis applications that use our framework: a state-of-the-art speaker diarization application, a content-based music recommendation system based on the Million Song Dataset, and a video event detection system for consumer-produced videos. We show that across this wide range of applications, our approach achieves the goal of automatic portability and scalability while at the same time allowing easy prototyping in a high-level language and efficient performance of low-level optimized code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chandra:2014:HPM, author = "Surendar Chandra and John Boreczky and Lawrence A. 
Rowe", title = "High performance many-to-many intranet screen sharing with {DisplayCast}", journal = j-TOMCCAP, volume = "10", number = "2", pages = "19:1--19:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2534328", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "DisplayCast is a many to many Intranet screen sharing system. Its screen capture mechanism creates a sequence of pixmap images of the screen updates. Prior systems that used a similar approach were designed to operate over constrained wide-area networks and did not exploit the Intranet network conditions to achieve high capture rates. First we empirically analyzed the screen contents for a variety of scenarios. We showed that screen updates were sporadic with long periods of inactivity. When active, screens were updated at far higher rates than was supported by earlier systems. The mismatch was pronounced for interactive scenarios. Even during active screen updates, the number of updated pixels were frequently small. We showed that crucial information can be lost if individual updates were merged. When the available system resources could not support high capture rates, we showed ways in which updates can be effectively collapsed. Next, we investigate compression mechanisms for streaming these updates. Even while using a hardware encoder, lossy compressors such as H.264 were unable to sustain high frame rates. Though Zlib lossless compression operated within the latency and compression rate requirements, the compression efficiency was poor. By analyzing the screen pixels, we developed a practical transformation that significantly improved compression rates. DisplayCast incorporates these observations. 
It shares the processor and network resources required for screen capture, compression and transmission with host applications whose output needs to be shared. DisplayCast is agile and uses faster processing capability to achieve even higher performance. Our system components operate natively in Windows 7, Mac OS X and iOS and is deployed in a production setting. DisplayCast is released under a New BSD License.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lee:2014:NDH, author = "Ya-Lin Lee and Wen-Hsiang Tsai", title = "A new data hiding method via revision history records on collaborative writing platforms", journal = j-TOMCCAP, volume = "10", number = "2", pages = "20:1--20:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2534408", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A new data hiding method via collaboratively-written articles with forged revision history records on collaborative writing platforms is proposed. The hidden message is camouflaged as a stego-document consisting of a stego-article and a revision history created through a simulated process of collaborative writing. The revisions are forged using a database constructed by mining word sequences used in real cases from an English Wikipedia XML dump. 
Four characteristics of article revisions are identified and utilized to embed secret messages, including the author of each revision, the number of corrected word sequences, the content of the corrected word sequences, and the word sequences replacing the corrected ones. Related problems arising in utilizing these characteristics for data hiding are identified and solved skillfully, resulting in an effective multiway method for hiding secret messages into the revision history. To create more realistic revisions, Huffman coding based on the word sequence frequencies collected from Wikipedia is applied to encode the word sequences. Good experimental results show the feasibility of the proposed method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yuan:2014:MRB, author = "Jin Yuan and Yi-Liang Zhao and Huanbo Luan and Meng Wang and Tat-Seng Chua", title = "Memory recall based video search: Finding videos you have seen before based on your memory", journal = j-TOMCCAP, volume = "10", number = "2", pages = "21:1--21:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2534409", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We often remember images and videos that we have seen or recorded before but cannot quite recall the exact venues or details of the contents. We typically have vague memories of the contents, which can often be expressed as a textual description and/or rough visual descriptions of the scenes. Using these vague memories, we then want to search for the corresponding videos of interest. 
We call this ``Memory Recall based Video Search'' (MRVS). To tackle this problem, we propose a video search system that permits a user to input his/her vague and incomplete query as a combination of text query, a sequence of visual queries, and/or concept queries. Here, a visual query is often in the form of a visual sketch depicting the outline of scenes within the desired video, while each corresponding concept query depicts a list of visual concepts that appears in that scene. As the query specified by users is generally approximate or incomplete, we need to develop techniques to handle this inexact and incomplete specification by also leveraging on user feedback to refine the specification. We utilize several innovative approaches to enhance the automatic search. First, we employ a visual query suggestion model to automatically suggest potential visual features to users as better queries. Second, we utilize a color similarity matrix to help compensate for inexact color specification in visual queries. Third, we leverage on the ordering of visual queries and/or concept queries to rerank the results by using a greedy algorithm. Moreover, as the query is inexact and there is likely to be only one or few possible answers, we incorporate an interactive feedback loop to permit the users to label related samples which are visually similar or semantically close to the relevant sample. Based on the labeled samples, we then propose optimization algorithms to update visual queries and concept weights to refine the search results. We conduct experiments on two large-scale video datasets: TRECVID 2010 and YouTube. The experimental results demonstrate that our proposed system is effective for MRVS tasks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2014:MIK, author = "Xianglong Liu and Yadong Mu and Bo Lang and Shih-Fu Chang", title = "Mixed image-keyword query adaptive hashing over multilabel images", journal = j-TOMCCAP, volume = "10", number = "2", pages = "22:1--22:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2540990", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 13 07:37:57 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article defines a new hashing task motivated by real-world applications in content-based image retrieval, that is, effective data indexing and retrieval given mixed query (query image together with user-provided keywords). Our work is distinguished from state-of-the-art hashing research by two unique features: (1) Unlike conventional image retrieval systems, the input query is a combination of an exemplar image and several descriptive keywords, and (2) the input image data are often associated with multiple labels. It is an assumption that is more consistent with the realistic scenarios. The mixed image-keyword query significantly extends traditional image-based query and better explicates the user intention. Meanwhile it complicates semantics-based indexing on the multilabel data. Though several existing hashing methods can be adapted to solve the indexing task, unfortunately they all prove to suffer from low effectiveness. To enhance the hashing efficiency, we propose a novel scheme ``boosted shared hashing''. 
Unlike prior works that learn the hashing functions on either all image labels or a single label, we observe that the hashing function can be more effective if it is designed to index over an optimal label subset. In other words, the association between labels and hash bits are moderately sparse. The sparsity of the bit-label association indicates greatly reduced computation and storage complexities for indexing a new sample, since only limited number of hashing functions will become active for the specific sample. We develop a Boosting style algorithm for simultaneously optimizing both the optimal label subsets and hashing functions in a unified formulation, and further propose a query-adaptive retrieval mechanism based on hash bit selection for mixed queries, no matter whether or not the query words exist in the training data. Moreover, we show that the proposed method can be easily extended to the case where the data similarity is gauged by nonlinear kernel functions. Extensive experiments are conducted on standard image benchmarks like CIFAR-10, NUS-WIDE and a-TRECVID. The results validate both the sparsity of the bit-label association and the convergence of the proposed algorithm, and demonstrate that the proposed hashing scheme achieves substantially superior performances over state-of-the-art methods under the same hash bit budget.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Anonymous:2014:TCO, author = "Anonymous", title = "Table of Contents: Online Supplement Volume 10, Number 1s", journal = j-TOMCCAP, volume = "10", number = "3", pages = "22:1--22:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2602969", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2014:DUB, author = "Ning Liu and Huajie Cui and S.-H. Gary Chan and Zhipeng Chen and Yirong Zhuang", title = "Dissecting User Behaviors for a Simultaneous Live and {VoD IPTV} System", journal = j-TOMCCAP, volume = "10", number = "3", pages = "23:1--23:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2568194", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "IPTV services deployed nowadays often consist of both live TV and Video-on-Demand (VoD), offered by the same service provider to the same pool of users over the same managed network. Understanding user behaviors in such a setting is hence an important step for system modelling and optimization. Previous studies on user behavior on video services were on either live TV or VoD. 
For the first time, we conduct an in-depth large-scale behavior study for IPTV users offering simultaneously live TV and VoD choices at the same time. Our data is from the largest IPTV service provider in China, offering hundreds of live channels and hundreds of thousands of VoD files, with traces covering more than 1.9 million users over a period of 5 months. This large dataset provides us a unique opportunity to cross-compare user viewing behaviors for these services on the same platform, and sheds valuable insights on how users interact with such a simultaneous system. Our results lead to new understanding on IPTV user behaviors which have strong implications on system design. For example, we find that the average holding time for VoD is significantly longer than live TV. Live TV users tend to surf more. However, if such channel surfing is discounted, the holding times of both services are not much different. While users in VoD tend to view HD longer, channel popularity for live TV is much less dependent on its video quality. In contrast to some popular assumptions on user interactivity, the transitions among live TV, VoD, and offline modes are far from a Markov model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gaeta:2014:DDI, author = "Rossano Gaeta and Marco Grangetto and Lorenzo Bovio", title = "{DIP}: {Distributed Identification of Polluters} in {P2P} Live Streaming", journal = j-TOMCCAP, volume = "10", number = "3", pages = "24:1--24:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2568223", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Peer-to-peer live streaming applications are vulnerable to malicious actions of peers that deliberately modify data to decrease or prevent the fruition of the media (pollution attack). In this article we propose DIP, a fully distributed, accurate, and robust algorithm for the identification of polluters. DIP relies on checks that are computed by peers upon completing reception of all blocks composing a data chunk. A check is a special message that contains the set of peer identifiers that provided blocks of the chunk as well as a bit to signal if the chunk has been corrupted. Checks are periodically transmitted by peers to their neighbors in the overlay network; peers receiving checks use them to maintain a factor graph. This graph is bipartite and an incremental belief propagation algorithm is run on it to compute the probability of a peer being a polluter. Using a prototype deployed over PlanetLab we show by extensive experimentation that DIP allows honest peers to identify polluters with very high accuracy and completeness, even when polluters collude to deceive them. 
Furthermore, we show that DIP is efficient, requiring low computational, communication, and storage overhead at each peer.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hoque:2014:SEM, author = "Mohammad Ashraful Hoque and Matti Siekkinen and Jukka K. Nurminen and Sasu Tarkoma and Mika Aalto", title = "Saving Energy in Mobile Devices for On-Demand Multimedia Streaming --- A Cross-Layer Approach", journal = j-TOMCCAP, volume = "10", number = "3", pages = "25:1--25:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2556942", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes a novel energy-efficient multimedia delivery system called EStreamer. First, we study the relationship between buffer size at the client, burst-shaped TCP-based multimedia traffic, and energy consumption of wireless network interfaces in smartphones. Based on the study, we design and implement EStreamer for constant bit rate and rate-adaptive streaming. EStreamer can improve battery lifetime by 3x, 1.5x, and 2x while streaming over Wi-Fi, 3G, and 4G, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2014:HEK, author = "Feng Wang and Wan-Lei Zhao and Chong-Wah Ngo and Bernard Merialdo", title = "A {Hamming} Embedding Kernel with Informative Bag-of-Visual Words for Video Semantic Indexing", journal = j-TOMCCAP, volume = "10", number = "3", pages = "26:1--26:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2535938", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose a novel Hamming embedding kernel with informative bag-of-visual words to address two main problems existing in traditional BoW approaches for video semantic indexing. First, Hamming embedding is employed to alleviate the information loss caused by SIFT quantization. The Hamming distances between keypoints in the same cell are calculated and integrated into the SVM kernel to better discriminate different image samples. Second, to highlight the concept-specific visual information, we propose to weight the visual words according to their informativeness for detecting specific concepts. We show that our proposed kernels can significantly improve the performance of concept detection.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2014:MDF, author = "Ying Yang and Ioannis Ivrissimtzis", title = "Mesh Discriminative Features for {$3$D} Steganalysis", journal = j-TOMCCAP, volume = "10", number = "3", pages = "27:1--27:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2535555", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We propose a steganalytic algorithm for triangle meshes, based on the supervised training of a classifier by discriminative feature vectors. After a normalization step, the triangle mesh is calibrated by one step of Laplacian smoothing and then a feature vector is computed, encoding geometric information corresponding to vertices, edges and faces. For a given steganographic or watermarking algorithm, we create a training set containing unmarked meshes and meshes marked by that algorithm, and train a classifier using Quadratic Discriminant Analysis. The performance of the proposed method was evaluated on six well-known watermarking/steganographic schemes with satisfactory accuracy rates.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hamam:2014:QEM, author = "Abdelwahab Hamam and Abdulmotaleb {El Saddik} and Jihad Alja'am", title = "A Quality of Experience Model for Haptic Virtual Environments", journal = j-TOMCCAP, volume = "10", number = "3", pages = "28:1--28:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2540991", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Haptic-based Virtual Reality (VR) applications have many merits. What is still obscure, from the designer's perspective of these applications, is the experience the users will undergo when they use the VR system. Quality of Experience (QoE) is an evaluation metric from the user's perspective that unfortunately has received limited attention from the research community. Assessing the QoE of VR applications reflects the amount of overall satisfaction and benefits gained from the application in addition to laying the foundation for ideal user-centric design in the future. In this article, we propose a taxonomy for the evaluation of QoE for multimedia applications and in particular VR applications. We model this taxonomy using a Fuzzy logic Inference System (FIS) to quantitatively measure the QoE of haptic virtual environments. We build and test our FIS by conducting a users' study analysis to evaluate the QoE of a haptic game application. Our results demonstrate that the proposed FIS model reflects the user's estimation of the application's quality significantly with low error and hence is suited for QoE evaluation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Botta:2014:PCI, author = "Marco Botta and Davide Cavagnino and Victor Pomponiu", title = "Protecting the Content Integrity of Digital Imagery with Fidelity Preservation: An Improved Version", journal = j-TOMCCAP, volume = "10", number = "3", pages = "29:1--29:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2568224", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Fragile watermarking has attracted a lot of attention in the last decade. An interesting approach, presented in 2011 by Lin et al., results in very high quality of the watermarked images. However, after a thorough examination of the paper, a few improvements are proposed in our revised version of the algorithm in order to overcome some shortcomings. In particular, changes to the pseudocode and modifications to deal with pixel saturation are suggested, along with a way to improve the scheme security. Finally, a deeper analysis of the security is presented.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Luo:2014:ICH, author = "Da Luo and Weiqi Luo and Rui Yang and Jiwu Huang", title = "Identifying Compression History of Wave Audio and Its Applications", journal = j-TOMCCAP, volume = "10", number = "3", pages = "30:1--30:??", month = apr, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2575978", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Apr 15 12:20:53 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Audio signal is sometimes stored and/or processed in WAV (waveform) format without any knowledge of its previous compression operations. To perform some subsequent processing, such as digital audio forensics, audio enhancement and blind audio quality assessment, it is necessary to identify its compression history. In this article, we will investigate how to identify a decompressed wave audio that went through one of three popular compression schemes, including MP3, WMA (windows media audio) and AAC (advanced audio coding). By analyzing the corresponding frequency coefficients, including modified discrete cosine transform (MDCT) and Mel-frequency cepstral coefficients (MFCCs), of those original audio clips and their decompressed versions with different compression schemes and bit rates, we propose several statistics to identify the compression scheme as well as the corresponding bit rate previously used for a given WAV signal. The experimental results evaluated on 8,800 audio clips with various contents have shown the effectiveness of the proposed method. 
In addition, some potential applications of the proposed method are discussed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } %%% ==================================================================== %%% From the ACM Portal Web site: ``On 23rd May 2014, ACM TOMCCAP %%% changed its acronym to ACM TOMM. This acronym change was the result %%% of extensive discussions between the journal Editorial Board and %%% SIGMM constituents dating back to 2011. This name change emphasizes %%% the continued strong collaboration with the ACM Multimedia %%% conference (ACMMM).'' %%% ==================================================================== @Article{Zhang:2014:CDM, author = "Tianzhu Zhang and Changsheng Xu", title = "Cross-Domain Multi-Event Tracking via {CO-PMHT}", journal = j-TOMM, volume = "10", number = "4", pages = "31:1--31:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2602633", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the massive growth of events on the Internet, efficient organization and monitoring of events becomes a practical challenge. To deal with this problem, we propose a novel CO-PMHT (CO-Probabilistic Multi-Hypothesis Tracking) algorithm for cross-domain multi-event tracking to obtain their informative summary details and evolutionary trends over time. We collect a large-scale dataset by searching keywords on two domains (Google News and Flickr) and downloading both images and textual content for an event. Given the input data, our algorithm can track multiple events in the two domains collaboratively and boost the tracking performance. 
Specifically, the bridge between two domains is a semantic posterior probability, that avoids the domain gap. After tracking, we can visualize the whole evolutionary process of the event over time and mine the semantic topics of each event for deep understanding and event prediction. The extensive experimental evaluations on the collected dataset well demonstrate the effectiveness of the proposed algorithm for cross-domain multi-event tracking.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2014:PVR, author = "Qinghua Huang and Bisheng Chen and Jingdong Wang and Tao Mei", title = "Personalized Video Recommendation through Graph Propagation", journal = j-TOMM, volume = "10", number = "4", pages = "32:1--32:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2598779", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The rapid growth of the number of videos on the Internet provides enormous potential for users to find content of interest. However, the vast quantity of videos also turns the finding process into a difficult task. In this article, we address the problem of providing personalized video recommendation for users. Rather than only exploring the user-video bipartite graph that is formulated using click information, we first combine the clicks and queries information to build a tripartite graph. In the tripartite graph, the query nodes act as bridges to connect user nodes and video nodes. 
Then, to further enrich the connections between users and videos, three subgraphs between the same kinds of nodes are added to the tripartite graph by exploring content-based information (video tags and textual queries). We propose an iterative propagation algorithm over the enhanced graph to compute the preference information of each user. Experiments conducted on a dataset with 1,369 users, 8,765 queries, and 17,712 videos collected from a commercial video search engine demonstrate the effectiveness of the proposed method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2014:UVS, author = "Haitao Li and Xu Cheng and Jiangchuan Liu", title = "Understanding Video Sharing Propagation in Social Networks: Measurement and Analysis", journal = j-TOMM, volume = "10", number = "4", pages = "33:1--33:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2594440", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Modern online social networking has drastically changed the information distribution landscape. Recently, video has become one of the most important types of objects spreading among social networking service users. The sheer and ever-increasing data volume, the broader coverage, and the longer access durations of video objects, however, present significantly more challenges than other types of objects. This article takes an initial step toward understanding the unique characteristics of video sharing propagation in social networks. 
Based on real-world data traces from a large-scale online social network, we examine the user behavior from diverse aspects and identify different types of users involved in video propagation. We closely investigate the temporal distribution during propagation as well as the typical propagation structures, revealing more details beyond stationary coverage. We further extend the conventional epidemic models to accommodate diverse types of users and their probabilistic viewing and sharing behaviors. The model, effectively capturing the essentials of the propagation process, serves as a valuable basis for such applications as workload synthesis, traffic prediction, and resource provision of video servers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2014:BCM, author = "Zhiyu Wang and Peng Cui and Lexing Xie and Wenwu Zhu and Yong Rui and Shiqiang Yang", title = "Bilateral Correspondence Model for Words-and-Pictures Association in Multimedia-Rich Microblogs", journal = j-TOMM, volume = "10", number = "4", pages = "34:1--34:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2611388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Nowadays, the amount of multimedia contents in microblogs is growing significantly. More than 20\% of microblogs link to a picture or video in certain large systems. The rich semantics in microblogs provides an opportunity to endow images with higher-level semantics beyond object labels. However, this raises new challenges for understanding the association between multimodal multimedia contents in multimedia-rich microblogs. 
Disobeying the fundamental assumptions of traditional annotation, tagging, and retrieval systems, pictures and words in multimedia-rich microblogs are loosely associated and a correspondence between pictures and words cannot be established. To address the aforementioned challenges, we present the first study analyzing and modeling the associations between multimodal contents in microblog streams, aiming to discover multimodal topics from microblogs by establishing correspondences between pictures and words in microblogs. We first use a data-driven approach to analyze the new characteristics of the words, pictures, and their association types in microblogs. We then propose a novel generative model called the Bilateral Correspondence Latent Dirichlet Allocation (BC-LDA) model. Our BC-LDA model can assign flexible associations between pictures and words and is able to not only allow picture-word co-occurrence with bilateral directions, but also single modal association. This flexible association can best fit the data distribution, so that the model can discover various types of joint topics and generate pictures and words with the topics accordingly. We evaluate this model extensively on a large-scale real multimedia-rich microblogs dataset. We demonstrate the advantages of the proposed model in several application scenarios, including image tagging, text illustration, and topic discovery. The experimental results demonstrate that our proposed model can significantly and consistently outperform traditional approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lei:2014:FND, author = "Yanqiang Lei and Guoping Qiu and Ligang Zheng and Jiwu Huang", title = "Fast Near-Duplicate Image Detection Using Uniform Randomized Trees", journal = j-TOMM, volume = "10", number = "4", pages = "35:1--35:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2602186", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Indexing structure plays an important role in the application of fast near-duplicate image detection, since it can narrow down the search space. In this article, we develop a cluster of uniform randomized trees (URTs) as an efficient indexing structure to perform fast near-duplicate image detection. The main contribution in this article is that we introduce ``uniformity'' and ``randomness'' into the indexing construction. The uniformity requires classifying the object images into the same scale subsets. Such a decision makes good use of the two facts in near-duplicate image detection, namely: (1) the number of categories is huge; (2) a single category usually contains only a small number of images. Therefore, the uniform distribution is very beneficial to narrow down the search space and does not significantly degrade the detection accuracy. The randomness is embedded into the generation of feature subspace and projection direction, improving the flexibility of indexing construction. The experimental results show that the proposed method is more efficient than the popular locality-sensitive hashing and more stable and flexible than the traditional KD-tree.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yeh:2014:PPR, author = "Che-Hua Yeh and Brian A. Barsky and Ming Ouhyoung", title = "Personalized Photograph Ranking and Selection System Considering Positive and Negative User Feedback", journal = j-TOMM, volume = "10", number = "4", pages = "36:1--36:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2584105", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose a novel personalized ranking system for amateur photographs. The proposed framework treats the photograph assessment as a ranking problem and we introduce the idea of personalized ranking, which ranks photographs considering both their aesthetic qualities and personal preferences. Photographs are described using three types of features: photo composition, color and intensity distribution, and personalized features. An aesthetic prediction model is learned from labeled photographs by using the proposed image features and RBF-ListNet learning algorithm. The experimental results show that the proposed framework outperforms in the ranking performance: a Kendall's tau value of 0.432 is significantly higher than those obtained by the features proposed in one of the state-of-the-art approaches (0.365) and by learning based on support vector regression (0.384). To realize personalization in ranking, three approaches are proposed: the feature-based approach allows users to select photographs with specific rules, the example-based approach takes the positive feedback from users to rerank the photograph, and the list-based approach takes both positive and negative feedback from users into consideration. 
User studies indicate that all three approaches are effective in both aesthetic and personalized ranking.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tan:2014:PVS, author = "Song Tan and Yu-Gang Jiang and Chong-Wah Ngo", title = "Placing Videos on a Semantic Hierarchy for Search Result Navigation", journal = j-TOMM, volume = "10", number = "4", pages = "37:1--37:??", month = jun, year = "2014", DOI = "https://doi.org/10.1145/2578394", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 8 11:32:58 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Organizing video search results in a list view is widely adopted by current commercial search engines, which cannot support efficient browsing for complex search topics that have multiple semantic facets. In this article, we propose to organize video search results in a highly structured way. Specifically, videos are placed on a semantic hierarchy that accurately organizes various facets of a given search topic. To pick the most suitable videos for each node of the hierarchy, we define and utilize three important criteria: relevance, uniqueness, and diversity. Extensive evaluations on a large YouTube video dataset demonstrate the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Steinmetz:2014:EN, author = "Ralf Steinmetz", title = "Editorial Note", journal = j-TOMM, volume = "11", number = "1", pages = "1:1--1:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2634234", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2014:SBA, author = "Yong-Jin Liu and Cui-Xia Ma and Qiufang Fu and Xiaolan Fu and Sheng-Feng Qin and Lexing Xie", title = "A Sketch-Based Approach for Interactive Organization of Video Clips", journal = j-TOMM, volume = "11", number = "1", pages = "2:1--2:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2645643", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid growth of video resources, techniques for efficient organization of video clips are becoming appealing in the multimedia domain. In this article, a sketch-based approach is proposed to intuitively organize video clips by: (1) enhancing their narrations using sketch annotations and (2) structurizing the organization process by gesture-based free-form sketching on touch devices. There are two main contributions of this work. 
The first is a sketch graph, a novel representation for the narrative structure of video clips to facilitate content organization. The second is a method to perform context-aware sketch recommendation scalable to large video collections, enabling common users to easily organize sketch annotations. A prototype system integrating the proposed approach was evaluated on the basis of five different aspects concerning its performance and usability. Two sketch searching experiments showed that the proposed context-aware sketch recommendation outperforms, in terms of accuracy and scalability, two state-of-the-art sketch searching methods. Moreover, a user study showed that the sketch graph is consistently preferred over traditional representations such as keywords and keyframes. The second user study showed that the proposed approach is applicable in those scenarios where the video annotator and organizer were the same person. The third user study showed that, for video content organization, using sketch graph users took on average 1/3 less time than using a mass-market tool Movie Maker and took on average 1/4 less time than using a state-of-the-art sketch alternative. These results demonstrated that the proposed sketch graph approach is a promising video organization tool.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2014:CSA, author = "Junshi Huang and Si Liu and Junliang Xing and Tao Mei and Shuicheng Yan", title = "Circle \& Search: Attribute-Aware Shoe Retrieval", journal = j-TOMM, volume = "11", number = "1", pages = "3:1--3:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632165", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Taking the shoe as a concrete example, we present an innovative product retrieval system that leverages object detection and retrieval techniques to support a brand-new online shopping experience in this article. The system, called Circle \& Search, enables users to naturally indicate any preferred product by simply circling the product in images as the visual query, and then returns visually and semantically similar products to the users. The system is characterized by introducing attributes in both the detection and retrieval of the shoe. Specifically, we first develop an attribute-aware part-based shoe detection model. By maintaining the consistency between shoe parts and attributes, this shoe detector has the ability to model high-order relations between parts and thus the detection performance can be enhanced. Meanwhile, the attributes of this detected shoe can also be predicted as the semantic relations between parts. Based on the result of shoe detection, the system ranks all the shoes in the repository using an attribute refinement retrieval model that takes advantage of query-specific information and attribute correlation to provide an accurate and robust shoe retrieval. 
To evaluate this retrieval system, we build a large dataset with 17,151 shoe images, in which each shoe is annotated with 10 shoe attributes, e.g., heel height, heel shape, sole shape, etc. According to the experimental result and the user study, our Circle \& Search system achieves promising shoe retrieval performance and thus significantly improves the users' online shopping experience.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Guan:2014:TAV, author = "Genliang Guan and Zhiyong Wang and Shaohui Mei and Max Ott and Mingyi He and David Dagan Feng", title = "A Top-Down Approach for Video Summarization", journal = j-TOMM, volume = "11", number = "1", pages = "4:1--4:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632267", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While most existing video summarization approaches aim to identify important frames of a video from either a global or local perspective, we propose a top-down approach consisting of scene identification and scene summarization. For scene identification, we represent each frame with global features and utilize a scalable clustering method. We then formulate scene summarization as choosing those frames that best cover a set of local descriptors with minimal redundancy. In addition, we develop a visual word-based approach to make our approach more computationally scalable. Experimental results on two benchmark datasets demonstrate that our proposed approach clearly outperforms the state-of-the-art.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pazzi:2014:PPP, author = "Richard W. Pazzi and Azzedine Boukerche", title = "{PROPANE}: a Progressive Panorama Streaming Protocol to Support Interactive {$3$D} Virtual Environment Exploration on Graphics-Constrained Devices", journal = j-TOMM, volume = "11", number = "1", pages = "5:1--5:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2602222", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image-Based Rendering (IBR) has become widely known by its relatively low requirements for generating new scenes based on a sequence of reference images. This characteristic of IBR shows a remarkable potential impact in rendering complex 3D virtual environments on graphics-constrained devices, such as head-mounted displays, set-top boxes, media streaming devices, and so on. If well exploited, IBR coupled with remote rendering would enable the exploration of complex virtual environments on these devices. However, remote rendering requires the transmission of a large volume of images. In addition, existing solutions consider limited and/or deterministic navigation schemes as a means of decreasing the volume of streamed data. This article proposes the PROgressive PANorama StrEaming protocol (PROPANE) to offer users a smoother virtual navigation experience by prestreaming the imagery data required to generate new views as the user wanders within a 3D environment. PROPANE is based on a very simple yet effective trigonometry model and uses a strafe (lateral movement) technique to minimize the delay between image updates at the client end. 
This article introduces the concept of key partial panoramas, namely panorama segments that cover movements in any direction by simply strafing from an appropriate key partial panorama and streaming the amount of lost pixels. Therefore, PROPANE can provide a constrained device with sufficient imagery data to cover a future user's viewpoints, thereby minimizing the impact of transmission delay and jitter. PROPANE has been implemented and compared to two baseline remote rendering schemes. The evaluation results show that the proposed technique outperforms the selected and closely related existing schemes by minimizing the response time while not limiting the user to predefined paths as opposed to previous protocols.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2014:FEM, author = "Xiangyu Wang and Yong Rui and Mohan Kankanhalli", title = "{Up-Fusion}: an Evolving Multimedia Fusion Method", journal = j-TOMM, volume = "11", number = "1", pages = "6:1--6:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2611777", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The amount of multimedia data on the Internet has increased exponentially in the past few decades and this trend is likely to continue. Multimedia content inherently has multiple information sources, therefore effective fusion methods are critical for data analysis and understanding. So far, most of the existing fusion methods are static with respect to time, making it difficult for them to handle the evolving multimedia content. 
To address this issue, in recent years, several evolving fusion methods were proposed, however, their requirements are difficult to meet, making them useful only in limited applications. In this article, we propose a novel evolving fusion method based on the online portfolio selection theory. The proposed method takes into account the correlation among different information sources and evolves the fusion model when new multimedia data is added. It performs effectively on both crisp and soft decisions without requiring additional context information. Extensive experiments on concept detection and human detection tasks over the TRECVID dataset and surveillance data have been conducted and significantly better performance has been obtained.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2014:EIP, author = "Xinxi Wang and Yi Wang and David Hsu and Ye Wang", title = "Exploration in Interactive Personalized Music Recommendation: a Reinforcement Learning Approach", journal = j-TOMM, volume = "11", number = "1", pages = "7:1--7:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2623372", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current music recommender systems typically act in a greedy manner by recommending songs with the highest user ratings. Greedy recommendation, however, is suboptimal over the long term: it does not actively gather information on user preferences and fails to recommend novel songs that are potentially interesting. 
A successful recommender system must balance the needs to explore user preferences and to exploit this information for recommendation. This article presents a new approach to music recommendation by formulating this exploration-exploitation trade-off as a reinforcement learning task. To learn user preferences, it uses a Bayesian model that accounts for both audio content and the novelty of recommendations. A piecewise-linear approximation to the model and a variational inference algorithm help to speed up Bayesian inference. One additional benefit of our approach is a single unified model for both music recommendation and playlist generation. We demonstrate the strong potential of the proposed approach with simulation results and a user study.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Katti:2014:OEE, author = "Harish Katti and Anoop Kolar Rajagopal and Mohan Kankanhalli and Ramakrishnan Kalpathi", title = "Online Estimation of Evolving Human Visual Interest", journal = j-TOMM, volume = "11", number = "1", pages = "8:1--8:??", month = aug, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632284", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Sep 1 12:38:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Regions in video streams attracting human interest contribute significantly to human understanding of the video. Being able to predict salient and informative Regions of Interest (ROIs) through a sequence of eye movements is a challenging problem. 
Applications such as content-aware retargeting of videos to different aspect ratios while preserving informative regions and smart insertion of dialog (closed-caption text)$^1$ into the video stream can significantly be improved using the predicted ROIs. We propose an interactive human-in-the-loop framework to model eye movements and predict visual saliency into yet-unseen frames. Eye tracking and video content are used to model visual attention in a manner that accounts for important eye-gaze characteristics such as temporal discontinuities due to sudden eye movements, noise, and behavioral artifacts. A novel statistical- and algorithm-based method gaze buffering is proposed for eye-gaze analysis and its fusion with content-based features. Our robust saliency prediction is instantiated for two challenging and exciting applications. The first application alters video aspect ratios on-the-fly using content-aware video retargeting, thus making them suitable for a variety of display sizes. The second application dynamically localizes active speakers and places dialog captions on-the-fly in the video stream. Our method ensures that dialogs are faithful to active speaker locations and do not interfere with salient content in the video stream. Our framework naturally accommodates personalisation of the application to suit biases and preferences of individual users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghinea:2014:ISI, author = "Gheorghita Ghinea and Christian Timmerer and Weisi Lin and Stephen Gulliver", title = "Introduction to Special Issue on Multiple Sensorial {(MulSeMedia)} Multimodal Media: Advances and Applications", journal = j-TOMM, volume = "11", number = "1s", pages = "9:1--9:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661333", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lv:2014:MHF, author = "Zhihan Lv and Alaa Halawani and Shengzhong Feng and Haibo Li and Shafiq Ur R{\'e}hman", title = "Multimodal Hand and Foot Gesture Interaction for Handheld Devices", journal = j-TOMM, volume = "11", number = "1s", pages = "10:1--10:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2645860", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present a hand-and-foot-based multimodal interaction approach for handheld devices. Our method combines input modalities (i.e., hand and foot) and provides a coordinated output to both modalities along with audio and video. 
Human foot gesture is detected and tracked using contour-based template detection (CTD) and Tracking-Learning-Detection (TLD) algorithm. 3D foot pose is estimated from passive homography matrix of the camera. 3D stereoscopic and vibrotactile are used to enhance the immersive feeling. We developed a multimodal football game based on the multimodal approach as a proof-of-concept. We confirm our system's user satisfaction through a user study.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Prasad:2014:DVC, author = "Manoj Prasad and Murat Russell and Tracy A. Hammond", title = "Designing Vibrotactile Codes to Communicate Verb Phrases", journal = j-TOMM, volume = "11", number = "1s", pages = "11:1--11:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637289", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Soldiers, to guard themselves from enemy assault, have to maintain visual and auditory awareness of their environment. Their visual and auditory senses are thus saturated. This makes these channels less usable for communication. The tactile medium of communication with users is appropriate for displaying information in such situations. Research in interpersonal communication among soldiers shows that the most common form of communication between soldiers involves the use of verb phrases. In this article, we have developed a three-by-three tactile display and proposed a method for mapping the components of a verb phrase to two dimensions of tactile codes-shape and waveform.
Perception of tactile codes by users depends on the ability of users to distinguish shape and waveform of the code. We have proposed a measure to rate the distinguish-ability of any two shapes and created a graph-based user-centric model using this measure to select distinguishable shapes from a set of all presentable shapes. We conducted two user studies to evaluate the ability of users to perceive tactile information. The results from our first study showed users' ability to perceive tactile shapes, tactile waveforms, and form verb phrases from tactile codes. The recognition accuracy and time taken to distinguish were better when the shapes were selected from the graph model than when shapes were chosen based on intuition. The second user study was conducted to test the performance of users while performing a primary visual task simultaneously with a secondary audio or haptic task. Users were more familiar with perceiving information from an auditory medium than from a haptic medium, which was reflected in their performance. Thus the performance of users in the primary visual task was better while using an audio medium of communication than while using a haptic medium of communication.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Murray:2014:MSE, author = "Niall Murray and Brian Lee and Yuansong Qiao and Gabriel-Miro Muntean", title = "Multiple-Scent Enhanced Multimedia Synchronization", journal = j-TOMM, volume = "11", number = "1s", pages = "12:1--12:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637293", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study looked at users' perception of interstream synchronization between audiovisual media and two olfactory streams. The ability to detect skews and the perception and impact of skews on user Quality of Experience (QoE) is analyzed. The olfactory streams are presented with the same skews (i.e., delay) and with variable skews (i.e., jitter and mix of scents). This article reports the limits beyond which desynchronization reduces user-perceived quality levels. Also, a minimum gap between the presentations of consecutive scents is identified, necessary to ensuring enhanced user-perceived quality. There is no evidence (not considering scent type) that overlapping or mixing of scents increases user QoE levels for olfaction-enhanced multimedia.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kroupi:2014:ECP, author = "Eleni Kroupi and Ashkan Yazdani and Jean-Marc Vesin and Touradj Ebrahimi", title = "{EEG} Correlates of Pleasant and Unpleasant Odor Perception", journal = j-TOMM, volume = "11", number = "1s", pages = "13:1--13:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637287", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Olfaction-enhanced multimedia experience is becoming vital for strengthening the sensation of reality and the quality of user experience. One approach to investigate olfactory perception is to analyze the alterations in brain activity during stimulation with different odors. In this article, the changes in the electroencephalogram (EEG) when perceiving hedonically-different odors are studied. Results of within and across-subject analysis are presented. We show that EEG-based odor classification using brain activity is possible and can be used to automatically recognize odor pleasantness when a subject-specific classifier is trained. However, it is a challenging problem to design a generic classifier.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rainer:2014:GUM, author = "Benjamin Rainer and Christian Timmerer", title = "A Generic Utility Model Representing the Quality of Sensory Experience", journal = j-TOMM, volume = "11", number = "1s", pages = "14:1--14:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2648429", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current QoE research is mainly focusing on single modalities (audio, visual) or combinations thereof. In our research, we propose annotating traditional multimedia content with additional sensory effects, such as ambient light, vibration, wind, and olfaction, which could potentially stimulate all human senses. Investigating the influence of individual sensory effects and combinations thereof is important in order to understand how these individual sensory effects influence the Quality of Experience (QoE) as a whole. In this article, we describe the results of such a subjective quality assessment of audio-visual sequences which are annotated with additional sensory effects such as ambient light, wind, and vibration using the MPEG-V standard. The results of this assessment allow us to derive a utility model representing the Quality of Sensory Experience (QuaSE) complementary to existing QoE models described in terms of Quality of Service (QoS) parameters. For validating our proposed utility model, we provide an example instantiation and validate it against results of subjective quality assessments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yuan:2014:UQE, author = "Zhenhui Yuan and Shengyang Chen and Gheorghita Ghinea and Gabriel-Miro Muntean", title = "User Quality of Experience of Mulsemedia Applications", journal = j-TOMM, volume = "11", number = "1s", pages = "15:1--15:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661329", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "User Quality of Experience (QoE) is of fundamental importance in multimedia applications and has been extensively studied for decades. However, user QoE in the context of the emerging multiple-sensorial media (mulsemedia) services, which involve different media components than the traditional multimedia applications, have not been comprehensively studied. This article presents the results of subjective tests which have investigated user perception of mulsemedia content. In particular, the impact of intensity of certain mulsemedia components including haptic and airflow on user-perceived experience are studied. Results demonstrate that by making use of mulsemedia the overall user enjoyment levels increased by up to 77\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Luque:2014:IMS, author = "Francisco Pedro Luque and Iris Galloso and Claudio Feijoo and Carlos Alberto Mart{\'\i}n and Guillermo Cisneros", title = "Integration of Multisensorial Stimuli and Multimodal Interaction in a Hybrid {$3$DTV} System", journal = j-TOMM, volume = "11", number = "1s", pages = "16:1--16:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617992", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes the integration of multisensorial stimuli and multimodal interaction components into a sports multimedia asset under two dimensions: immersion and interaction. The first dimension comprises a binaural audio system and a set of sensory effects synchronized with the audiovisual content, whereas the second explores interaction through the insertion of interactive 3D objects into the main screen and on-demand presentation of additional information in a second touchscreen. We present an end-to-end solution integrating these components into a hybrid (internet-broadcast) television system using current 3DTV standards. Results from an experimental study analyzing the perceived quality of these stimuli and their influence on the Quality of Experience are presented.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ghinea:2014:MSA, author = "Gheorghita Ghinea and Christian Timmerer and Weisi Lin and Stephen R. 
Gulliver", title = "Mulsemedia: State of the Art, Perspectives, and Challenges", journal = j-TOMM, volume = "11", number = "1s", pages = "17:1--17:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617994", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mulsemedia-multiple sensorial media-captures a wide variety of research efforts and applications. This article presents a historic perspective on mulsemedia work and reviews current developments in the area. These take place across the traditional multimedia spectrum-from virtual reality applications to computer games-as well as efforts in the arts, gastronomy, and therapy, to mention a few. We also describe standardization efforts, via the MPEG-V standard, and identify future developments and exciting challenges the community needs to overcome.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zha:2014:ISI, author = "Zheng-Jun Zha and Lei Zhang and Max M{\"u}hlh{\"a}user and Alan F. Smeaton", title = "Introduction to the Special Issue Best Papers of {ACM Multimedia 2013}", journal = j-TOMM, volume = "11", number = "1s", pages = "18:1--18:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661331", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Fang:2014:DGI, author = "Quan Fang and Jitao Sang and Changsheng Xu", title = "Discovering Geo-Informative Attributes for Location Recognition and Exploration", journal = j-TOMM, volume = "11", number = "1s", pages = "19:1--19:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2648581", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article considers the problem of automatically discovering geo-informative attributes for location recognition and exploration. The attributes are expected to be both discriminative and representative, which correspond to certain distinctive visual patterns and associate with semantic interpretations. For our solution, we analyze the attribute at the region level. Each segmented region in the training set is assigned a binary latent variable indicating its discriminative capability. A latent learning framework is proposed for discriminative region detection and geo-informative attribute discovery. Moreover, we use user-generated content to obtain the semantic interpretation for the discovered visual attributes. Discriminative and search-based attribute annotation methods are developed for geo-informative attribute interpretation. The proposed approach is evaluated on one challenging dataset including GoogleStreetView and Flickr photos. Experimental results show that (1) geo-informative attributes are discriminative and useful for location recognition; (2) the discovered semantic interpretation is meaningful and can be exploited for further location exploration.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2014:WYB, author = "Luoqi Liu and Junliang Xing and Si Liu and Hui Xu and Xi Zhou and Shuicheng Yan", title = "{``Wow! You Are So Beautiful Today!''}", journal = j-TOMM, volume = "11", number = "1s", pages = "20:1--20:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2659234", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Beauty e-Experts, a fully automatic system for makeover recommendation and synthesis, is developed in this work. The makeover recommendation and synthesis system simultaneously considers many kinds of makeover items on hairstyle and makeup. Given a user-provided frontal face image with short/bound hair and no/light makeup, the Beauty e-Experts system not only recommends the most suitable hairdo and makeup, but also synthesizes the virtual hairdo and makeup effects. To acquire enough knowledge for beauty modeling, we built the Beauty e-Experts Database, which contains 1,505 female photos with a variety of attributes annotated with different discrete values. We organize these attributes into two different categories, beauty attributes and beauty-related attributes. Beauty attributes refer to those values that are changeable during the makeover process and thus need to be recommended by the system. Beauty-related attributes are those values that cannot be changed during the makeup process but can help the system to perform recommendation. 
Based on this Beauty e-Experts Dataset, two problems are addressed for the Beauty e-Experts system: what to recommend and how to wear it, which describes a similar process of selecting hairstyle and cosmetics in daily life. For the what-to-recommend problem, we propose a multiple tree-structured supergraph model to explore the complex relationships among high-level beauty attributes, mid-level beauty-related attributes, and low-level image features. Based on this model, the most compatible beauty attributes for a given facial image can be efficiently inferred. For the how-to-wear-it problem, an effective and efficient facial image synthesis module is designed to seamlessly synthesize the recommended makeovers into the user facial image. We have conducted extensive experiments on testing images of various conditions to evaluate and analyze the proposed system. The experimental results well demonstrate the effectiveness and efficiency of the proposed system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2014:AAS, author = "Hanwang Zhang and Zheng-Jun Zha and Yang Yang and Shuicheng Yan and Yue Gao and Tat-Seng Chua", title = "Attribute-Augmented Semantic Hierarchy: Towards a Unified Framework for Content-Based Image Retrieval", journal = j-TOMM, volume = "11", number = "1s", pages = "21:1--21:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637291", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel attribute-augmented semantic hierarchy (A$^2$ SH) and demonstrates its effectiveness in bridging both the semantic and intention gaps in content-based image retrieval (CBIR). A$^2$ SH organizes semantic concepts into multiple semantic levels and augments each concept with a set of related attributes. The attributes are used to describe the multiple facets of the concept and act as the intermediate bridge connecting the concept and low-level visual content. An hierarchical semantic similarity function is learned to characterize the semantic similarities among images for retrieval. To better capture user search intent, a hybrid feedback mechanism is developed, which collects hybrid feedback on attributes and images. This feedback is then used to refine the search results based on A$^2$ SH. We use A$^2$ SH as a basis to develop a unified content-based image retrieval system. We conduct extensive experiments on a large-scale dataset of over one million Web images. 
Experimental results show that the proposed A$^2$ SH can characterize the semantic affinities among images accurately and can shape user search intent quickly, leading to more accurate search results as compared to state-of-the-art CBIR solutions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2014:SSS, author = "Xin Zhao and Xue Li and Chaoyi Pang and Quan Z. Sheng and Sen Wang and Mao Ye", title = "Structured Streaming Skeleton --- A New Feature for Online Human Gesture Recognition", journal = j-TOMM, volume = "11", number = "1s", pages = "22:1--22:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2648583", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online human gesture recognition has a wide range of applications in computer vision, especially in human-computer interaction applications. The recent introduction of cost-effective depth cameras brings a new trend of research on body-movement gesture recognition. However, there are two major challenges: (i) how to continuously detect gestures from unsegmented streams, and (ii) how to differentiate different styles of the same gesture from other types of gestures. In this article, we solve these two problems with a new effective and efficient feature extraction method-Structured Streaming Skeleton (SSS)-which uses a dynamic matching approach to construct a feature vector for each frame. Our comprehensive experiments on MSRC-12 Kinect Gesture, Huawei/3DLife-2013, and MSR-Action3D datasets have demonstrated superior performances than the state-of-the-art approaches. 
We also demonstrate model selection based on the proposed SSS feature, where the classifier of squared loss regression with l$_{2, 1}$ norm regularization is a recommended classifier for best performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Carbunar:2014:EFN, author = "Bogdan Carbunar and Rahul Potharaju and Michael Pearce and Venugopal Vasudevan and Michael Needham", title = "Errata for: {A Framework for Network Aware Caching for Video on Demand Systems}", journal = j-TOMM, volume = "11", number = "1s", pages = "23:1--23:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661298", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Oct 3 12:44:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", note = "See \cite{Carbunar:2013:FNA}.", abstract = "Some errors were introduced into this article in the preparation of the final source files. The errors are summarized in the following text and revised pages with the corrected elements indicated in red are provided. The full corrected article can be accessed in the ACM DL, DOI https://doi.org/10.1145/2501643.2501652 -Page 8: New Figure 6(a) -Page 16: New Figures 8(a), 8(b), and 9(a) -Page 17: New Figure 10(b) -Page 18: New Figures 11 and 12; corrected text reference -Page 19: Final sentence deleted", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2014:AGS, author = "Ying Zhang and Luming Zhang and Roger Zimmermann", title = "Aesthetics-Guided Summarization from Multiple User Generated Videos", journal = j-TOMM, volume = "11", number = "2", pages = "24:1--24:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2659520", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recent years, with the rapid development of camera technology and portable devices, we have witnessed a flourish of user generated videos, which are gradually reshaping the traditional professional video oriented media market. The volume of user generated videos in repositories is increasing at a rapid rate. In today's video retrieval systems, a simple query will return many videos which seriously increase the viewing burden. To manage these video retrievals and provide viewers with an efficient way to browse, we introduce a system to automatically generate a summarization from multiple user generated videos and present their salience to viewers in an enjoyable manner. Among multiple consumer videos, we find their qualities to be highly diverse due to various factors such as a photographer's experience or environmental conditions at the time of capture. Such quality inspires us to include a video quality evaluation component into the video summarization since videos with poor qualities can seriously degrade the viewing experience. We first propose a probabilistic model to evaluate the aesthetic quality of each user generated video. 
This model compares the rich aesthetics information from several well-known photo databases with generic unlabeled consumer videos, under a human perception component indicating the correlation between a video and its constituting frames. Subjective studies were carried out with the results indicating that our method is reliable. Then a novel graph-based formulation is proposed for the multi-video summarization task. Desirable summarization criteria are incorporated as the graph attributes and the problem is solved through a dynamic programming framework. Comparisons with several state-of-the-art methods demonstrate that our algorithm performs better than other methods in generating a skimming video in preserving the essential scenes from the original multiple input videos, with smooth transitions among consecutive segments and appealing aesthetics overall.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Calagari:2014:AAL, author = "Kiana Calagari and Mohammad Reza Pakravan and Shervin Shirmohammadi and Mohamed Hefeeda", title = "{ALP}: Adaptive Loss Protection Scheme with Constant Overhead for Interactive Video Applications", journal = j-TOMM, volume = "11", number = "2", pages = "25:1--25:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2656203", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "There has been an increasing demand for interactive video transmission over the Internet for applications such as video conferencing, video calls, and telepresence applications.
These applications are increasingly moving towards providing High Definition (HD) video quality to users. A key challenge in these applications is to preserve the quality of video when it is transported over best-effort networks that do not guarantee lossless transport of video packets. In such conditions, it is important to protect the transmitted video by using intelligent and adaptive protection schemes. Applications such as HD video conferencing require live interaction among participants, which limits the overall delay the system can tolerate. Therefore, the protection scheme should add little or no extra delay to video transport. We propose a novel Adaptive Loss Protection (ALP) scheme for interactive HD video applications such as video conferencing and video chats. This scheme adds negligible delay to the transmission process and is shown to achieve better quality than other schemes in lossy networks. The proposed ALP scheme adaptively applies four different protection modes to cope with the dynamic network conditions, which results in high video quality in all network conditions. Our ALP scheme consists of four protection modes; each of these modes utilizes a protection method. Two of the modes rely on the state-of-the-art protection methods, and we propose a new Integrated Loss Protection (ILP) method for the other two modes. In the ILP method we integrate three factors for distributing the protection among packets. These three factors are error propagation, region of interest and header information. In order to decide when to switch between the protection modes, a new metric is proposed based on the effectiveness of each mode in performing protection, rather than just considering network statistics such as packet loss rate. Results show that by using this metric not only the overall quality will be improved but also the variance of quality will decrease. 
One of the main advantages of the proposed ALP scheme is that it does not increase the bit rate overhead in poor network conditions. Our results show a significant gain in video quality, up to 3dB PSNR improvement is achieved using our scheme, compared to protecting all packets equally with the same amount of overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ren:2014:BGO, author = "Dongni Ren and Yisheng Xu and S.-H. Gary Chan", title = "Beyond {1Mbps} Global Overlay Live Streaming: The Case of Proxy Helpers", journal = j-TOMM, volume = "11", number = "2", pages = "26:1--26:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2652485", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In order to provide live streaming over the global Internet, a content provider often deploys an overlay network consisting of distributed proxies placed close to user pools. Streaming of multi-Mbps video over such an overlay is challenging because of bandwidth bottlenecks in paths. To effectively overcome these bottlenecks, we consider employing proxy helpers in the overlay to provide rich path diversity. The helpers do not have any attached users, and hence may forward partial video streams (or not at all) if necessary. In this way, the helpers serve as stepping stones to supply full streams to the servers. The issue is how to involve the helpers in the overlay to achieve low streaming delay meeting a certain high streaming bitrate requirement. 
To address the issue, we first formulate the problem which captures various delay and bandwidth components, and show that it is NP-hard. We then propose an efficient algorithm called Stepping-Stones (SS) which can be efficiently implemented in a controller. Given the encouraging simulation results, we develop a novel streaming testbed for SS and explore, through sets of Internet experiments, the effectiveness of helpers to achieve high bitrate (multi-Mbps) global live streaming. In our experiments, proxies are deployed with a reasonably wide global footprint. We collect more than a hundred hours of streaming traces with bitrate ranging from 500kbps to a few Mbps. Our experimental data validates that helpers indeed play an important role in achieving high bitrate in today's Internet. Global multi-Mbps streaming is possible due to their multihop and multipath advantages. Our experimental trials and data also provide valuable insights on the design of a global push-based streaming network. There are strong benefits of using proxy helpers to achieve high bitrate and low delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Qian:2014:SEC, author = "Shengsheng Qian and Tianzhu Zhang and Changsheng Xu and M. 
Shamim Hossain", title = "Social Event Classification via Boosted Multimodal Supervised Latent {Dirichlet} Allocation", journal = j-TOMM, volume = "11", number = "2", pages = "27:1--27:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2659521", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapidly increasing popularity of social media sites (e.g., Flickr, YouTube, and Facebook), it is convenient for users to share their own comments on many social events, which successfully facilitates social event generation, sharing and propagation and results in a large amount of user-contributed media data (e.g., images, videos, and text) for a wide variety of real-world events of different types and scales. As a consequence, it has become more and more difficult to exactly find the interesting events from massive social media data, which is useful to browse, search and monitor social events by users or governments. To deal with these issues, we propose a novel boosted multimodal supervised Latent Dirichlet Allocation (BMM-SLDA) for social event classification by integrating a supervised topic model, denoted as multi-modal supervised Latent Dirichlet Allocation (mm-SLDA), in the boosting framework. Our proposed BMM-SLDA has a number of advantages. (1) Our mm-SLDA can effectively exploit the multimodality and the multiclass property of social events jointly, and make use of the supervised category label information to classify multiclass social event directly. (2) It is suitable for large-scale data analysis by utilizing boosting weighted sampling strategy to iteratively select a small subset of data to efficiently train the corresponding topic models. 
(3) It effectively exploits social event structure by the document weight distribution with classification error and can iteratively learn new topic model to correct the previously misclassified event documents. We evaluate our BMM-SLDA on a real world dataset and show extensive experimental results, which demonstrate that our model outperforms state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ye:2014:OBL, author = "Jun Ye and Kien A. Hua", title = "Octree-Based {$3$D} Logic and Computation of Spatial Relationships in Live Video Query Processing", journal = j-TOMM, volume = "11", number = "2", pages = "28:1--28:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2645864", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Live video computing (LVC) on distributed smart cameras has many important applications; and a database approach based on a Live Video DataBase Management System (LVDBMS) has shown to be effective for general LVC application development. The performance of such a database system relies on accurate interpretation of spatial relationships among objects in the live video. With the popularity of affordable depth cameras, 3D spatial computation techniques have been applied. However, the 3D object models currently used are expensive to compute, and offer limited scalability. We address this drawback in this article by proposing an octree-based 3D spatial logic and presenting algorithms for computing 3D spatial relationships using depth cameras. 
To support continuous query processing on live video streams, we also develop a GPU-based implementation of the proposed technique to further enhance scalability for real-time applications. Extensive performance studies based on a public RGB-D dataset as well as the LVDBMS prototype demonstrates the correctness and efficiency of our techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yin:2014:STT, author = "Yifang Yin and Zhijie Shen and Luming Zhang and Roger Zimmermann", title = "Spatial-Temporal Tag Mining for Automatic Geospatial Video Annotation", journal = j-TOMM, volume = "11", number = "2", pages = "29:1--29:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2658981", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Videos are increasingly geotagged and used in practical and powerful GIS applications. However, video search and management operations are typically supported by manual textual annotations, which are subjective and laborious. Therefore, research has been conducted to automate or semi-automate this process. Since a diverse vocabulary for video annotations is of paramount importance towards good search results, this article proposes to leverage crowdsourced data from social multimedia applications that host tags of diverse semantics to build a spatio-temporal tag repository, consequently acting as input to our auto-annotation approach. 
In particular, to build the tag store, we retrieve the necessary data from several social multimedia applications, mine both the spatial and temporal features of the tags, and then refine and index them accordingly. To better integrate the tag repository, we extend our previous approach by leveraging the temporal characteristics of videos as well. Moreover, we set up additional ranking criteria on the basis of tag similarity, popularity and location bias. Experimental results demonstrate that, by making use of such a tag repository, the generated tags have a wide range of semantics, and the resulting rankings are more consistent with human perception.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2014:LAM, author = "Chih-Wei Lin and Kuan-Wen Chen and Shen-Chi Chen and Cheng-Wu Chen and Yi-Ping Hung", title = "Large-Area, Multilayered, and High-Resolution Visual Monitoring Using a Dual-Camera System", journal = j-TOMM, volume = "11", number = "2", pages = "30:1--30:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2645862", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Large-area, high-resolution visual monitoring systems are indispensable in surveillance applications. To construct such systems, high-quality image capture and display devices are required. 
Whereas high-quality displays have rapidly developed, as exemplified by the announcement of the 85-inch 4K ultrahigh-definition TV by Samsung at the 2013 Consumer Electronics Show (CES), high-resolution surveillance cameras have progressed slowly and remain not widely used compared with displays. In this study, we designed an innovative framework, using a dual-camera system comprising a wide-angle fixed camera and a high-resolution pan-tilt-zoom (PTZ) camera to construct a large-area, multilayered, and high-resolution visual monitoring system that features multiresolution monitoring of moving objects. First, we developed a novel calibration approach to estimate the relationship between the two cameras and calibrate the PTZ camera. The PTZ camera was calibrated based on the consistent property of distinct pan-tilt angle at various zooming factors, accelerating the calibration process without affecting accuracy; this calibration process has not been reported previously. After calibrating the dual-camera system, we used the PTZ camera and synthesized a large-area and high-resolution background image. When foreground targets were detected in the images captured by the wide-angle camera, the PTZ camera was controlled to continuously track the user-selected target. Last, we integrated preconstructed high-resolution background and low-resolution foreground images captured using the wide-angle camera and the high-resolution foreground image captured using the PTZ camera to generate a large-area, multilayered, and high-resolution view of the scene.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Deng:2014:TFP, author = "Zhengyu Deng and Ming Yan and Jitao Sang and Changsheng Xu", title = "{Twitter} is Faster: Personalized Time-Aware Video Recommendation from {Twitter} to {YouTube}", journal = j-TOMM, volume = "11", number = "2", pages = "31:1--31:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637285", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Traditional personalized video recommendation methods focus on utilizing user profile or user history behaviors to model user interests, which follows a static strategy and fails to capture the swift shift of the short-term interests of users. According to our cross-platform data analysis, the information emergence and propagation is faster in social textual stream-based platforms than that in multimedia sharing platforms at micro user level. Inspired by this, we propose a dynamic user modeling strategy to tackle personalized video recommendation issues in the multimedia sharing platform YouTube, by transferring knowledge from the social textual stream-based platform Twitter. In particular, the cross-platform video recommendation strategy is divided into two steps. (1) Real-time hot topic detection: the hot topics that users are currently following are extracted from users' tweets, which are utilized to obtain the related videos in YouTube. (2) Time-aware video recommendation: for the target user in YouTube, the obtained videos are ranked by considering the user profile in YouTube, time factor, and quality factor to generate the final recommendation list. 
In this way, the short-term (hot topics) and long-term (user profile) interests of users are jointly considered. Carefully designed experiments have demonstrated the advantages of the proposed method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2014:SFV, author = "Yongtao Hu and Jan Kautz and Yizhou Yu and Wenping Wang", title = "Speaker-Following Video Subtitles", journal = j-TOMM, volume = "11", number = "2", pages = "32:1--32:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632111", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 7 17:48:10 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We propose a new method for improving the presentation of subtitles in video (e.g., TV and movies). With conventional subtitles, the viewer has to constantly look away from the main viewing area to read the subtitles at the bottom of the screen, which disrupts the viewing experience and causes unnecessary eyestrain. Our method places on-screen subtitles next to the respective speakers to allow the viewer to follow the visual content while simultaneously reading the subtitles. We use novel identification algorithms to detect the speakers based on audio and visual information. Then the placement of the subtitles is determined using global optimization. A comprehensive usability study indicated that our subtitle placement method outperformed both conventional fixed-position subtitling and another previous dynamic subtitling method in terms of enhancing the overall viewing experience and reducing eyestrain.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2015:ISI, author = "Kuan-Ta Chen and Songqing Chen and Wei Tsang Ooi", title = "Introduction to the Special Issue on {MMSys 2014} and {NOSSDAV 2014}", journal = j-TOMM, volume = "11", number = "2s", pages = "41:1--41:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2717509", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Schaber:2015:CAM, author = "Philipp Schaber and Stephan Kopf and Sina Wetzel and Tyler Ballast and Christoph Wesch and Wolfgang Effelsberg", title = "{CamMark}: Analyzing, Modeling, and Simulating Artifacts in Camcorder Copies", journal = j-TOMM, volume = "11", number = "2s", pages = "42:1--42:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700295", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "To support the development of any system that includes the generation and evaluation of camcorder copies, as well as to provide a common benchmark for robustness against camcorder copies, we present a tool to simulate digital video re-acquisition using a digital video camera. 
By resampling each video frame, we simulate the typical artifacts occurring in a camcorder copy: geometric modifications (aspect ratio changes, cropping, perspective and lens distortion), temporal sampling artifacts (due to different frame rates, shutter speeds, rolling shutters, or playback), spatial and color subsampling (rescaling, filtering, Bayer color filter array), and processing steps (automatic gain control, automatic white balance). We also support the simulation of camera movement (e.g., a hand-held camera) and background insertion. Furthermore, we allow for an easy setup and calibration of all the simulated artifacts, using sample/reference pairs of images and videos. Specifically temporal subsampling effects are analyzed in detail to create realistic frame blending artifacts in the simulated copies. We carefully evaluated our entire camcorder simulation system and found that the models we developed describe and match the real artifacts quite well.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Toni:2015:OSA, author = "Laura Toni and Ramon Aparicio-Pardo and Karine Pires and Gwendal Simon and Alberto Blanc and Pascal Frossard", title = "Optimal Selection of Adaptive Streaming Representations", journal = j-TOMM, volume = "11", number = "2s", pages = "43:1--43:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700294", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Adaptive streaming addresses the increasing and heterogeneous demand of multimedia content over the Internet by offering several encoded versions for each video sequence. Each version (or representation) is characterized by a resolution and a bit rate, and it is aimed at a specific set of users, like TV or mobile phone clients. While most existing works on adaptive streaming deal with effective playout-buffer control strategies on the client side, in this article we take a providers' perspective and propose solutions to improve user satisfaction by optimizing the set of available representations. We formulate an integer linear program that maximizes users' average satisfaction, taking into account network dynamics, type of video content, and user population characteristics. The solution of the optimization is a set of encoding parameters corresponding to the representations set that maximizes user satisfaction. 
We evaluate this solution by simulating multiple adaptive streaming sessions characterized by realistic network statistics, showing that the proposed solution outperforms commonly used vendor recommendations, in terms of user satisfaction but also in terms of fairness and outage probability. The simulation results show that video content information as well as network constraints and users' statistics play a crucial role in selecting proper encoding parameters to provide fairness among users and to reduce network resource usage. We finally propose a few theoretical guidelines that can be used, in realistic settings, to choose the encoding parameters based on the user characteristics, the network capacity and the type of video content.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2015:ADF, author = "Liang Chen and Yipeng Zhou and Dah Ming Chiu", title = "Analysis and Detection of Fake Views in Online Video Services", journal = j-TOMM, volume = "11", number = "2s", pages = "44:1--44:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700290", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online video-on-demand (VoD) services invariably maintain a view count for each video they serve, and it has become an important currency for various stakeholders, from viewers, to content owners, advertisers, and the online service providers themselves. There is often significant financial incentive to use a robot (or a botnet) to artificially create fake views. How can we detect fake views? Can we detect them (and stop them) efficiently? 
What is the extent of fake views with current VoD service providers? These are the questions we study in this article. We develop some algorithms and show that they are quite effective for this problem.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Song:2015:SVT, author = "Minseok Song and Yeongju Lee and Jinhan Park", title = "Scheduling a Video Transcoding Server to Save Energy", journal = j-TOMM, volume = "11", number = "2s", pages = "45:1--45:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700282", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent popular streaming services such as TV Everywhere, N-Screen, and dynamic adaptive streaming over HTTP (DASH) need to deliver content to the wide range of devices, requiring video content to be transcoded into different versions. Transcoding tasks require a lot of computation, and each task typically has its own real-time constraint. These make it difficult to manage transcoding, but the more efficient use of energy in servers is an imperative. We characterize transcoding workloads in terms of deadlines and computation times, and propose a new dynamic voltage and frequency scaling (DVFS) scheme that allocates a frequency and a workload to each CPU with the aim of minimizing power consumption while meeting all transcoding deadlines. This scheme has been simulated, and also implemented in a Linux transcoding server, in which a frontend node distributes transcoding requests to heterogeneous backend nodes. 
This required a new protocol for communication between nodes, a DVFS management scheme to reduce power consumption and thread management and scheduling schemes which ensure that transcoding deadlines are met. Power measurements show that this approach can reduce system-wide energy consumption by 17\% to 31\%, compared with the Linux Ondemand governor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Langroodi:2015:DCA, author = "Mohsen Jamali Langroodi and Joseph Peters and Shervin Shirmohammadi", title = "Decoder-Complexity-Aware Encoding of Motion Compensation for Multiple Heterogeneous Receivers", journal = j-TOMM, volume = "11", number = "2s", pages = "46:1--46:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700300", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For mobile multimedia systems, advances in battery technology have been much slower than those in memory, graphics, and processing power, making power consumption a major concern in mobile systems. The computational complexity of video codecs, which consists of CPU operations and memory accesses, is one of the main factors affecting power consumption. In this article, we propose a method that achieves near-optimal video quality while respecting user-defined bounds on the complexity needed to decode a video. We specifically focus on the motion compensation process, including motion vector prediction and interpolation, because it is the single largest component of computation-based power consumption. 
We start by formulating a scenario with a single receiver as a rate-distortion optimization problem and we develop an efficient decoder-complexity-aware video encoding method to solve it. Then we extend our approach to handle multiple heterogeneous receivers, each with a different complexity requirement. We test our method experimentally using the H.264 standard for the single receiver scenario and the H.264 SVC extension for the multiple receiver scenario. Our experimental results show that our method can achieve up to 97\% of the optimal solution value in the single receiver scenario, and an average of 97\% of the optimal solution value in the multiple receiver scenario. Furthermore, our tests with actual power measurements show a power saving of up to 23\% at the decoder when the complexity threshold is halved in the encoder.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2015:TAT, author = "Shannon Chen and Zhenhuan Gao and Klara Nahrstedt and Indranil Gupta", title = "{$3$DTI} Amphitheater: Towards {$3$DTI} Broadcasting", journal = j-TOMM, volume = "11", number = "2s", pages = "47:1--47:??", month = feb, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700297", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 25 17:56:15 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "3DTI Amphitheater is a live broadcasting system for dissemination of 3DTI (3D Tele-immersive) content. 
The virtual environment constructed by the system mimics an amphitheater in the real world, where performers interact with each other in the central circular stage, and the audience is placed in virtual seats that surround the stage. Users of the Amphitheater can be geographically dispersed and the streams created by the performer sites are disseminated in a P2P network among the participants. To deal with the high bandwidth demand and strict latency bound of the service, we identify the hierarchical priority of streams in construction of the content dissemination forest. Result shows that the Amphitheater outperforms prior 3DTI systems by boosting the application QoS by a factor of 2.8 while sustaining the same hundred-scale audience group.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2015:PMV, author = "Ke Chen and Zhong Zhou and Wei Wu", title = "Progressive Motion Vector Clustering for Motion Estimation and Auxiliary Tracking", journal = j-TOMM, volume = "11", number = "3", pages = "33:1--33:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700296", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The motion vector similarity between neighboring blocks is widely used in motion estimation algorithms. However, for nonneighboring blocks, they may also have similar motions due to close depths or belonging to the same object inside the scene. Therefore, the motion vectors usually have several kinds of patterns, which reveal a clustering structure. 
In this article, we propose a progressive clustering algorithm, which periodically counts the motion vectors of the past blocks to make incremental clustering statistics. These statistics are used as the motion vector predictors for the following blocks. It is proved to be much more efficient for one block to find the best-matching candidate with the predictors. We also design the clustering based search with CUDA for GPU acceleration. Another interesting application of the clustering statistics is persistent static object tracking. Based on the statistics, several auxiliary tracking areas are created to guide the object tracking. Even when the target object has significant changes in appearance or it disappears occasionally, its position still can be predicted. The experiments on Xiph.org Video Test Media dataset illustrate that our clustering based search algorithm outperforms the mainstream and some state-of-the-art motion estimation algorithms. It is 33 times faster on average than the full search algorithm with only slightly higher mean-square error values in the experiments. The tracking results show that the auxiliary tracking areas help to locate the target object effectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2015:HFM, author = "Liquan Shen and Ping An and Zhaoyang Zhang and Qianqian Hu and Zhengchuan Chen", title = "A {$3$D-HEVC} Fast Mode Decision Algorithm for Real-Time Applications", journal = j-TOMM, volume = "11", number = "3", pages = "34:1--34:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700298", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "3D High Efficiency Video Coding (3D-HEVC) is an extension of the HEVC standard for coding of multiview videos and depth maps. It inherits the same quadtree coding structure as HEVC for both components, which allows recursively splitting into four equal-sized coding units (CU). One of 11 different prediction modes is chosen to code a CU in inter-frames. Similar to the joint model of H.264/AVC, the mode decision process in HM (reference software of HEVC) is performed using all the possible depth levels and prediction modes to find the one with the least rate distortion cost using a Lagrange multiplier. Furthermore, both motion estimation and disparity estimation need to be performed in the encoding process of 3D-HEVC. Those tools achieve high coding efficiency, but lead to a significant computational complexity. In this article, we propose a fast mode decision algorithm for 3D-HEVC. Since multiview videos and their associated depth maps represent the same scene, at the same time instant, their prediction modes are closely linked. 
Furthermore, the prediction information of a CU at the depth level X is strongly related to that of its parent CU at the depth level X-1 in the quadtree coding structure of HEVC since two corresponding CUs from two neighboring depth levels share similar video characteristics. The proposed algorithm jointly exploits the inter-view coding mode correlation, the inter-component (texture-depth) correlation and the inter-level correlation in the quadtree structure of 3D-HEVC. Experimental results show that our algorithm saves 66\% encoder runtime on average with only a 0.2\% BD-Rate increase on coded views and 1.3\% BD-Rate increase on synthesized views.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2015:BML, author = "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu and Ming-Hsuan Yang", title = "Boosted Multifeature Learning for Cross-Domain Transfer", journal = j-TOMM, volume = "11", number = "3", pages = "35:1--35:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700286", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Conventional learning algorithm assumes that the training data and test data share a common distribution. However, this assumption will greatly hinder the practical application of the learned model for cross-domain data analysis in multimedia. To deal with this issue, transfer learning based technology should be adopted. As a typical version of transfer learning, domain adaption has been extensively studied recently due to its theoretical value and practical interest. 
In this article, we propose a boosted multifeature learning (BMFL) approach to iteratively learn multiple representations within a boosting procedure for unsupervised domain adaption. The proposed BMFL method has a number of properties. (1) It reuses all instances with different weights assigned by the previous boosting iteration and avoids discarding labeled instances as in conventional methods. (2) It models the instance weight distribution effectively by considering the classification error and the domain similarity, which facilitates learning new feature representation to correct the previously misclassified instances. (3) It learns multiple different feature representations to effectively bridge the source and target domains. We evaluate the BMFL by comparing its performance on three applications: image classification, sentiment classification and spam filtering. Extensive experimental results demonstrate that the proposed BMFL algorithm performs favorably against state-of-the-art domain adaption methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2015:DVS, author = "Pei-Yu Lin", title = "Double Verification Secret Sharing Mechanism Based on Adaptive Pixel Pair Matching", journal = j-TOMM, volume = "11", number = "3", pages = "36:1--36:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700291", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Verifiability is essential for the secret sharing approach, which allows the involved participants to detect cheaters during the secret retrieval process. In this article, we propose a double verification secret sharing (DVSS) mechanism that can not only prevent fraudulent participants but also satisfy the requirements of secret payload, camouflage, image fidelity and lossless revealed secret. DVSS offers double verification process to enhance the cheater detectability; experimental results reveal that the designed scheme can share larger secret capacity and retain superior image quality than the related secret sharing methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2015:INB, author = "Shuang Wang and Shuqiang Jiang", title = "{INSTRE}: a New Benchmark for Instance-Level Object Retrieval and Recognition", journal = j-TOMM, volume = "11", number = "3", pages = "37:1--37:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700292", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Over the last several decades, researches on visual object retrieval and recognition have achieved fast and remarkable success. However, while the category-level tasks prevail in the community, the instance-level tasks (especially recognition) have not yet received adequate focuses. Applications such as content-based search engine and robot vision systems have alerted the awareness to bring instance-level tasks into a more realistic and challenging scenario. Motivated by the limited scope of existing instance-level datasets, in this article we propose a new benchmark for INSTance-level visual object REtrieval and REcognition (INSTRE). Compared with existing datasets, INSTRE has the following major properties: (1) balanced data scale, (2) more diverse intraclass instance variations, (3) cluttered and less contextual backgrounds, (4) object localization annotation for each image, (5) well-manipulated double-labelled images for measuring multiple object (within one image) case. We will quantify and visualize the merits of INSTRE data, and extensively compare them against existing datasets. Then on INSTRE, we comprehensively evaluate several popular algorithms to large-scale object retrieval problem with multiple evaluation metrics. 
Experimental results show that all the methods suffer a performance drop on INSTRE, proving that this field still remains a challenging problem. Finally we integrate these algorithms into a simple yet efficient scheme for recognition and compare it with classification-based methods. Importantly, we introduce the realistic multiobjects recognition problem. All experiments are conducted in both single object case and multiple objects case.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lathey:2015:IEE, author = "Ankita Lathey and Pradeep K. Atrey", title = "Image Enhancement in Encrypted Domain over Cloud", journal = j-TOMM, volume = "11", number = "3", pages = "38:1--38:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2656205", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud-based multimedia systems are becoming increasingly common. These systems offer not only storage facility, but also high-end computing infrastructure which can be used to process data for various analysis tasks ranging from low-level data quality enhancement to high-level activity and behavior identification operations. However, cloud data centers, being third party servers, are often prone to information leakage, raising security and privacy concerns. In this article, we present a Shamir's secret sharing based method to enhance the quality of encrypted image data over cloud. 
Using the proposed method we show that several image enhancement operations such as noise removal, antialiasing, edge and contrast enhancement, and dehazing can be performed in encrypted domain with near-zero loss in accuracy and minimal computation and data overhead. Moreover, the proposed method is proven to be information theoretically secure.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yin:2015:CVC, author = "Yifang Yin and Beomjoo Seo and Roger Zimmermann", title = "Content vs. Context: Visual and Geographic Information Use in Video Landmark Retrieval", journal = j-TOMM, volume = "11", number = "3", pages = "39:1--39:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700287", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Due to the ubiquity of sensor-equipped smartphones, it has become increasingly feasible for users to capture videos together with associated geographic metadata, for example the location and the orientation of the camera. Such contextual information creates new opportunities for the organization and retrieval of geo-referenced videos. In this study we explore the task of landmark retrieval through the analysis of two types of state-of-the-art techniques, namely media-content-based and geocontext-based retrievals. For the content-based method, we choose the Spatial Pyramid Matching (SPM) approach combined with two advanced coding methods: Sparse Coding (SC) and Locality-Constrained Linear Coding (LLC). 
For the geo-based method, we present the Geo Landmark Visibility Determination (GeoLVD) approach which computes the visibility of a landmark based on intersections of a camera's field-of-view (FOV) and the landmark's geometric information available from Geographic Information Systems (GIS) and services. We first compare the retrieval results of the two methods, and discuss the strengths and weaknesses of each approach in terms of precision, recall and execution time. Next we analyze the factors that affect the effectiveness for the content-based and the geo-based methods, respectively. Finally we propose a hybrid retrieval method based on the integration of the visual (content) and geographic (context) information, which is shown to achieve significant improvements in our experiments. We believe that the results and observations in this work will enlighten the design of future geo-referenced video retrieval systems, improve our understanding of selecting the most appropriate visual features for indexing and searching, and help in selecting between the most suitable methods for retrieval based on different conditions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2015:RCI, author = "Hong-Ying Yang and Xiang-Yang Wang and Pan-Pan Niu and Ai-Long Wang", title = "Robust Color Image Watermarking Using Geometric Invariant Quaternion Polar Harmonic Transform", journal = j-TOMM, volume = "11", number = "3", pages = "40:1--40:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700299", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 5 17:03:39 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "It is a challenging work to design a robust color image watermarking scheme against geometric distortions. Moments and moment invariants have become a powerful tool in robust image watermarking owing to their image description capability and geometric invariance property. However, the existing moment-based watermarking schemes were mainly designed for gray images but not for color images, and detection quality and robustness will be lowered when watermark is directly embedded into the luminance component or three color channels of color images. Furthermore, the imperceptibility of the embedded watermark is not well guaranteed. Based on algebra of quaternions and polar harmonic transform (PHT), we introduced the quaternion polar harmonic transform (QPHT) for invariant color image watermarking in this article, which can be seen as the generalization of PHT for gray-level images. It is shown that the QPHT can be obtained from the PHT of each color channel. We derived and analyzed the rotation, scaling, and translation (RST) invariant property of QPHT. We also discussed the problem of color image watermarking using QPHT. 
Experimental results are provided to illustrate the efficiency of the proposed color image watermarking against geometric distortions and common image processing operations (including color attacks).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Krishnappa:2015:CCV, author = "Dilip Kumar Krishnappa and Michael Zink and Carsten Griwodz and P{\aa}l Halvorsen", title = "Cache-Centric Video Recommendation: an Approach to Improve the Efficiency of {YouTube} Caches", journal = j-TOMM, volume = "11", number = "4", pages = "48:1--48:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2716310", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we take advantage of the user behavior of requesting videos from the top of the related list provided by YouTube to improve the performance of YouTube caches. We recommend that local caches reorder the related lists associated with YouTube videos, presenting the cached content above noncached content. We argue that the likelihood that viewers select content from the top of the related list is higher than selection from the bottom, and pushing contents already in the cache to the top of the related list would increase the likelihood of choosing cached content. To verify that the position on the list really is the selection criterion more dominant than the content itself, we conduct a user study with 40 YouTube-using volunteers who were presented with random related lists in their everyday YouTube use. 
After confirming our assumption, we analyze the benefits of our approach by an investigation that is based on two traces collected from a university campus. Our analysis shows that the proposed reordering approach for related lists would lead to a 2 to 5 times increase in cache hit rate compared to an approach without reordering the related list. This increase in hit rate would lead to reduction in server load and backend bandwidth usage, which in turn reduces the latency in streaming the video requested by the viewer and has the potential to improve the overall performance of YouTube's content distribution system. An analysis of YouTube's recommendation system reveals that related lists are created from a small pool of videos, which increases the potential for caching content from related lists and reordering based on the content in the cache.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2015:PMC, author = "Yu Zhang and James Z. Wang and Jia Li", title = "Parallel Massive Clustering of Discrete Distributions", journal = j-TOMM, volume = "11", number = "4", pages = "49:1--49:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700293", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The trend of analyzing big data in artificial intelligence demands highly-scalable machine learning algorithms, among which clustering is a fundamental and arguably the most widely applied method. 
To extend the applications of regular vector-based clustering algorithms, the Discrete Distribution (D2) clustering algorithm has been developed, aiming at clustering data represented by bags of weighted vectors which are well adopted data signatures in many emerging information retrieval and multimedia learning applications. However, the high computational complexity of D2-clustering limits its impact in solving massive learning problems. Here we present the parallel D2-clustering (PD2-clustering) algorithm with substantially improved scalability. We developed a hierarchical multipass algorithm structure for parallel computing in order to achieve a balance between the individual-node computation and the integration process of the algorithm. Experiments and extensive comparisons between PD2-clustering and other clustering algorithms are conducted on synthetic datasets. The results show that the proposed parallel algorithm achieves significant speed-up with minor accuracy loss. We apply PD2-clustering to image concept learning. In addition, by extending D2-clustering to symbolic data, we apply PD2-clustering to protein sequence clustering. For both applications, we demonstrate the high competitiveness of our new algorithm in comparison with other state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Baik:2015:EMR, author = "Eilwoo Baik and Amit Pande and Prasant Mohapatra", title = "Efficient {MAC} for Real-Time Video Streaming over Wireless {LAN}", journal = j-TOMM, volume = "11", number = "4", pages = "50:1--50:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2744412", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Wireless communication systems are highly prone to channel errors. With video being a major player in Internet traffic and undergoing exponential growth in wireless domain, we argue for the need of a Video-aware MAC (VMAC) to significantly improve the throughput and delay performance of real-time video streaming service. VMAC makes two changes to optimize wireless LAN for video traffic: (a) It incorporates a Perceptual-Error-Tolerance (PET) to the MAC frames by reducing MAC retransmissions while minimizing any impact on perceptual video quality; and (b) It uses a group NACK-based Adaptive Window (NAW) of MAC frames to improve both throughput and delay performance in varying channel conditions. Through simulations and experiments, we observe 56--89\% improvement in throughput and 34--48\% improvement in delay performance over legacy DCF and 802.11e schemes. VMAC also shows 15--78\% improvement over legacy schemes with multiple clients.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Antaris:2015:SSC, author = "Stefanos Antaris and Dimitrios Rafailidis", title = "Similarity Search over the Cloud Based on Image Descriptors' Dimensions Value Cardinalities", journal = j-TOMM, volume = "11", number = "4", pages = "51:1--51:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2716315", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recognition that in modern applications billions of images are stored into distributed databases in different logical or physical locations, we propose a similarity search strategy over the cloud based on the dimensions value cardinalities of image descriptors. Our strategy has low preprocessing requirements by dividing the computational cost of the preprocessing steps into several nodes over the cloud and locating the descriptors with similar dimensions value cardinalities logically close. New images are inserted into the distributed databases over the cloud efficiently, by supporting dynamical update in real-time. The proposed insertion algorithm has low computational complexity, depending exclusively on the dimensionality of descriptors and a small subset of descriptors with similar dimensions value cardinalities. Finally, an efficient query processing algorithm is proposed, where the dimensions of image descriptors are prioritized in the searching strategy, assuming that dimensions of high value cardinalities have more discriminative power than the dimensions of low ones. The computation effort of the query processing algorithm is divided into several nodes over the cloud infrastructure. 
In our experiments with seven publicly available datasets of image descriptors, we show that the proposed similarity search strategy outperforms competitive methods of single node, parallel and cloud-based architectures, in terms of preprocessing cost, search time and accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2015:AMD, author = "Yin-Tzu Lin and I-Ting Liu and Jyh-Shing Roger Jang and Ja-Ling Wu", title = "Audio Musical Dice Game: a User-Preference-Aware Medley Generating System", journal = j-TOMM, volume = "11", number = "4", pages = "52:1--52:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2710015", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes a framework for creating user-preference-aware music medleys from users' music collections. We treat the medley generation process as an audio version of a musical dice game. Once the user's collection has been analyzed, the system is able to generate various pleasing medleys. This flexibility allows users to create medleys according to the specified conditions, such as the medley structure or the must-use clips. Even users without musical knowledge can compose medley songs from their favorite tracks. The effectiveness of the system has been evaluated through both objective and subjective experiments on individual components in the system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2015:AVR, author = "Bo-Hao Chen and Shih-Chia Huang", title = "An Advanced Visibility Restoration Algorithm for Single Hazy Images", journal = j-TOMM, volume = "11", number = "4", pages = "53:1--53:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2726947", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Haze removal is the process by which horizontal obscuration is eliminated from hazy images captured during inclement weather. Images captured in natural environments with varied weather conditions frequently exhibit localized light sources or color-shift effects. The occurrence of these effects presents a difficult challenge for hazy image restoration, with which many traditional restoration methods cannot adequately contend. In this article, we present a new image haze removal approach based on Fisher's linear discriminant-based dual dark channel prior scheme in order to solve the problems associated with the presence of localized light sources and color shifts, and thereby achieve effective restoration. Experimental restoration results via qualitative and quantitative evaluations show that our proposed approach can provide higher haze-removal efficacy for images captured in varied weather conditions than can the other state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bao:2015:CPE, author = "Bing-Kun Bao and Changsheng Xu and Weiqing Min and Mohammod Shamim Hossain", title = "Cross-Platform Emerging Topic Detection and Elaboration from Multimedia Streams", journal = j-TOMM, volume = "11", number = "4", pages = "54:1--54:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2730889", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the explosive growth of online media platforms in recent years, it becomes more and more attractive to provide users a solution of emerging topic detection and elaboration. And this posts a real challenge to both industrial and academic researchers because of the overwhelming information available in multiple modalities and with large outlier noises. This article provides a method on emerging topic detection and elaboration using multimedia streams cross different online platforms. Specifically, Twitter, New York Times and Flickr are selected for the work to represent the microblog, news portal and imaging sharing platforms. The emerging keywords of Twitter are firstly extracted using aging theory. Then, to overcome the nature of short length message in microblog, Robust Cross-Platform Multimedia Co-Clustering (RCPMM-CC) is proposed to detect emerging topics with three novelties: (1) The data from different media platforms are in multimodalities; (2) The coclustering is processed based on a pairwise correlated structure, in which the involved three media platforms are pairwise dependent; (3) The noninformative samples are automatically pruned away at the same time of coclustering. 
In the last step of cross-platform elaboration, we enrich each emerging topic with the samples from New York Times and Flickr by computing the implicit links between social topics and samples from selected news and Flickr image clusters, which are obtained by RCPMM-CC. Qualitative and quantitative evaluation results demonstrate the effectiveness of our method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2015:QQG, author = "Yang Li and Azzedine Boukerche", title = "{QuGu}: a Quality Guaranteed Video Dissemination Protocol Over Urban Vehicular Ad Hoc Networks", journal = j-TOMM, volume = "11", number = "4", pages = "55:1--55:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2725469", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video dissemination over Vehicular Ad Hoc Networks is an attractive technology that supports many novel applications. The merit of this work lies in the design of an efficient video dissemination protocol that provides high video quality at different data rates for urban scenarios. Our objective is to improve received video quality while meeting delay and packet loss. In this work, we first employ a reliable scheme known as connected dominating set, which is an efficient receiver-based routing scheme for broadcasting video content. To avoid repeated computing of the connected dominating set, we add three statuses to each node. In nonscalable video coding, the distribution of lost frames can cause a major impact on video quality at the receiver's end. 
Therefore, for the second step, we employ Interleaving to spread out the burst losses and to reduce the influence of loss distributions. Although Interleaving can reduce the influence of cluster frame loss, single packet loss is also a concern due to collisions, and to intermittent disconnection in the topology. In order to fix these single packet losses, we propose a store-carry-forward scheme for the nodes in order to retransmit the local buffer stored packets. The results, when compared to the selected base protocols, show that our proposed protocol is an efficient solution for video dissemination over urban Vehicular Ad Hoc Networks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gaddam:2015:COM, author = "Vamsidhar Reddy Gaddam and Ragnhild Eg and Ragnar Langseth and Carsten Griwodz and P{\aa}l Halvorsen", title = "The Cameraman Operating My Virtual Camera is Artificial: Can the Machine Be as Good as a Human?", journal = j-TOMM, volume = "11", number = "4", pages = "56:1--56:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2744411", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we argue that the energy spent in designing autonomous camera control systems is not spent in vain. We present a real-time virtual camera system that can create smooth camera motion. Similar systems are frequently benchmarked with the human operator as the best possible reference; however, we avoid a priori assumptions in our evaluations. 
Our main question is simply whether we can design algorithms to steer a virtual camera that can compete with the user experience for recordings from an expert operator with several years of experience? In this respect, we present two low-complexity servoing methods that are explored in two user studies. The results from the user studies give a promising answer to the question pursued. Furthermore, all components of the system meet the real-time requirements on commodity hardware. The growing capabilities of both hardware and network in mobile devices give us hope that this system can be deployed to mobile users in the near future. Moreover, the design of the presented system takes into account that services to concurrent users must be supported.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Natarajan:2015:MCC, author = "Prabhu Natarajan and Pradeep K. Atrey and Mohan Kankanhalli", title = "Multi-Camera Coordination and Control in Surveillance Systems: a Survey", journal = j-TOMM, volume = "11", number = "4", pages = "57:1--57:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2710128", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 7 08:29:56 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The use of multiple heterogeneous cameras is becoming more common in today's surveillance systems. In order to perform surveillance tasks, effective coordination and control in multi-camera systems is very important, and is catching significant research attention these days. 
This survey aims to provide researchers with a state-of-the-art overview of various techniques for multi-camera coordination and control (MC$^3$) that have been adopted in surveillance systems. The existing literature on MC$^3$ is presented through several classifications based on the applicable architectures, frameworks and the associated surveillance tasks. Finally, a discussion on the open problems in surveillance area that can be solved effectively using MC$^3$ and the future directions in MC$^3$ research is presented.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{You:2015:UPD, author = "Shingchern D. You and Yi-Han Pu", title = "Using Paired Distances of Signal Peaks in Stereo Channels as Fingerprints for Copy Identification", journal = j-TOMM, volume = "12", number = "1", pages = "1:1--1:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742059", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes to use the relative distances between adjacent envelope peaks detected in stereo audio as fingerprints for copy identification. The matching algorithm used is the rough longest common subsequence (RLCS) algorithm. The experimental results show that the proposed approach has better identification accuracy than an MPEG-7 based scheme for distorted and noisy audio. When compared with other schemes, the proposed scheme uses fewer bits with comparable performance. 
The proposed fingerprints can also be used in conjunction with the MPEG-7 based scheme for lower computational burden.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{ElEssaili:2015:QBC, author = "Ali {El Essaili} and Zibin Wang and Eckehard Steinbach and Liang Zhou", title = "{QoE}-Based Cross-Layer Optimization for Uplink Video Transmission", journal = j-TOMM, volume = "12", number = "1", pages = "2:1--2:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2801124", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We study the problem of resource-efficient uplink distribution of user-generated video content over fourth-generation mobile networks. This is challenged by (1) the capacity-limited and time-variant uplink channel, (2) the resource-hungry upstreamed videos and their dynamically changing complexity, and (3) the different playout times of the video consumers. To address these issues, we propose a systematic approach for quality-of-experience (QoE)-based resource optimization and uplink transmission of multiuser generated video content. More specifically, we present an analytical model for distributed scalable video transmission at the mobile producers which considers these constraints. This is complemented by a multiuser cross-layer optimizer in the mobile network which determines the transmission capacity for each mobile terminal under current cell load and radio conditions. Both optimal and low-complexity solutions are presented. 
Simulation results for LTE uplink transmission show that significant gains in perceived video quality can be achieved by our cross-layer resource optimization scheme. In addition, the distributed optimization at the mobile producers can further improve the user experience across the different types of video consumers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2015:CSN, author = "Li-Jia Li and David A. Shamma and Xiangnan Kong and Sina Jafarpour and Roelof {Van Zwol} and Xuanhui Wang", title = "{CelebrityNet}: a Social Network Constructed from Large-Scale Online Celebrity Images", journal = j-TOMM, volume = "12", number = "1", pages = "3:1--3:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2801125", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Photos are an important information carrier for implicit relationships. In this article, we introduce an image based social network, called CelebrityNet, built from implicit relationships encoded in a collection of celebrity images. We analyze the social properties reflected in this image-based social network and automatically infer communities among the celebrities. We demonstrate the interesting discoveries of the CelebrityNet. We particularly compare the inferred communities with human manually labeled ones and show quantitatively that the automatically detected communities are highly aligned with that of human interpretation. 
Inspired by the uniqueness of visual content and tag concepts within each community of the CelebrityNet, we further demonstrate that the constructed social network can serve as a knowledge base for high-level visual recognition tasks. In particular, this social network is capable of significantly improving the performance of automatic image annotation and classification of unknown images.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2015:SDP, author = "Bo Zhang and Nicola Conci and Francesco G. B. {De Natale}", title = "Segmentation of Discriminative Patches in Human Activity Video", journal = j-TOMM, volume = "12", number = "1", pages = "4:1--4:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2750780", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we present a novel approach to segment discriminative patches in human activity videos. First, we adopt the spatio-temporal interest points (STIPs) to represent significant motion patterns in the video sequence. Then, nonnegative sparse coding is exploited to generate a sparse representation of each STIP descriptor. We construct the feature vector for each video by applying a two-stage sum-pooling and l$_2$ -normalization operation. After training a multi-class classifier through the error-correcting code SVM, the discriminative portion of each video is determined as the patch that has the highest confidence while also being correctly classified according to the video category. 
Experimental results show that the video patches extracted by our method are more separable, while preserving the perceptually relevant portion of each activity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2015:WMZ, author = "Hui Wang and Mun Choon Chan and Wei Tsang Ooi", title = "Wireless Multicast for Zoomable Video Streaming", journal = j-TOMM, volume = "12", number = "1", pages = "5:1--5:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2801123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Zoomable video streaming refers to a new class of interactive video applications, where users can zoom into a video stream to view a selected region of interest in higher resolutions and pan around to move the region of interest. The zoom and pan effects are typically achieved by breaking the source video into a grid of independently decodable tiles. Streaming the tiles to a set of heterogeneous users using broadcast is challenging, as users have different link rates and different regions of interest at different resolution levels. In this article, we consider the following problem: Given the subset of tiles that each user requested, the link rate of each user, and the available time slots, at which resolution should each tile be sent, to maximize the overall video quality received by all users. We design an efficient algorithm to solve this problem and evaluate the solution on a testbed using 10 mobile devices. 
Our method is able to achieve up to 12dB improvements over other heuristic methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bianco:2015:UPM, author = "Simone Bianco and Gianluigi Ciocca", title = "User Preferences Modeling and Learning for Pleasing Photo Collage Generation", journal = j-TOMM, volume = "12", number = "1", pages = "6:1--6:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2801126", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we consider how to automatically create pleasing photo collages created by placing a set of images on a limited canvas area. The task is formulated as an optimization problem. Differently from existing state-of-the-art approaches, we here exploit subjective experiments to model and learn pleasantness from user preferences. To this end, we design an experimental framework for the identification of the criteria that need to be taken into account to generate a pleasing photo collage. Five different thematic photo datasets are used to create collages using state-of-the-art criteria. A first subjective experiment where several subjects evaluated the collages, emphasizes that different criteria are involved in the subjective definition of pleasantness. We then identify new global and local criteria and design algorithms to quantify them. The relative importance of these criteria are automatically learned by exploiting the user preferences, and new collages are generated. To validate our framework, we performed several psycho-visual experiments involving different users. 
The results show that the proposed framework allows us to learn a novel computational model which effectively encodes an inter-user definition of pleasantness. The learned definition of pleasantness generalizes well to new photo datasets of different themes and sizes not used in the learning. Moreover, compared with two state-of-the-art approaches, the collages created using our framework are preferred by the majority of the users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Fu:2015:QBS, author = "Bo Fu and Dirk Staehle and Gerald Kunzmann and Eckehard Steinbach and Wolfgang Kellerer", title = "{QoE}-Based {SVC} Layer Dropping in {LTE} Networks Using Content-Aware Layer Priorities", journal = j-TOMM, volume = "12", number = "1", pages = "7:1--7:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2754167", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The increasing popularity of mobile video streaming applications has led to a high volume of video traffic in mobile networks. As the base station, for instance, the eNB in LTE networks, has limited physical resources, it can be overloaded by this traffic. This problem can be addressed by using Scalable Video Coding (SVC), which allows the eNB to drop layers of the video streams to dynamically adapt the bitrate. The impact of bitrate adaptation on the Quality of Experience (QoE) for the users depends on the content characteristics of videos. 
As the current mobile network architectures do not support the eNB in obtaining video content information, QoE optimization schemes with explicit signaling of content information have been proposed. These schemes, however, require the eNB or a specific optimization module to process the video content on the fly in order to extract the required information. This increases the computation and signaling overhead significantly, raising the OPEX for mobile operators. To address this issue, in this article, a content-aware (CA) priority marking and layer dropping scheme is proposed. The CA priority indicates a transmission order for the layers of all transmitted videos across all users, resulting from a comparison of their utility versus rate characteristics. The CA priority values can be determined at the P-GW on the fly, allowing mobile operators to control the priority marking process. Alternatively, they can be determined offline at the video servers, avoiding real-time computation in the core network. The eNB can perform content-aware SVC layer dropping using only the priority values. No additional content processing is required. The proposed scheme is lightweight both in terms of architecture and computation. The improvement in QoE is substantial and very close to the performance obtained with the computation and signaling-intensive QoE optimization schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2015:ASM, author = "Siqi Shen and Shun-Yun Hu and Alexandru Iosup and Dick Epema", title = "Area of Simulation: Mechanism and Architecture for Multi-Avatar Virtual Environments", journal = j-TOMM, volume = "12", number = "1", pages = "8:1--8:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2764463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Although Multi-Avatar Distributed Virtual Environments (MAVEs) such as Real-Time Strategy (RTS) games entertain daily hundreds of millions of online players, their current designs do not scale. For example, even popular RTS games such as the StarCraft series support in a single game instance only up to 16 players and only a few hundreds of avatars loosely controlled by these players, which is a consequence of the Event-Based Lockstep Simulation (EBLS) scalability mechanism they employ. Through empirical analysis, we show that a single Area of Interest (AoI), which is a scalability mechanism that is sufficient for single-avatar virtual environments (such as Role-Playing Games), also cannot meet the scalability demands of MAVEs. To enable scalable MAVEs, in this work we propose Area of Simulation (AoS), a new scalability mechanism, which combines and extends the mechanisms of AoI and EBLS. Unlike traditional AoI approaches, which employ only update-based operational models, our AoS mechanism uses both event-based and update-based operational models to manage not single, but multiple areas of interest. 
Unlike EBLS, which is traditionally used to synchronize the entire virtual world, our AoS mechanism synchronizes only selected areas of the virtual world. We further design an AoS-based architecture, which is able to use both our AoS and traditional AoI mechanisms simultaneously, dynamically trading-off consistency guarantees for scalability. We implement and deploy this architecture and we demonstrate that it can operate with an order of magnitude more avatars and a larger virtual world without exceeding the resource capacity of players' computers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lee:2015:LAR, author = "Suk Kyu Lee and Seungho Yoo and Jongtack Jung and Hwangnam Kim and Jihoon Ryoo", title = "Link-Aware Reconfigurable Point-to-Point Video Streaming for Mobile Devices", journal = j-TOMM, volume = "12", number = "1", pages = "9:1--9:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2771438", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Even though people of all social standings use current mobile devices in the wide spectrum of purpose from entertainment tools to communication means, some issues with real-time video streaming in hostile wireless environment still exist. In this article, we introduce CoSA, a link-aware real-time video streaming system for mobile devices. The proposed system utilizes a 3D camera to distinguish the region of importance (ROI) and non-ROI region within the video frame. 
Based on the link-state feedback from the receiver, the proposed system allocates a higher bandwidth for the region that is classified as ROI and a lower bandwidth for non-ROI in the video stream by reducing the video's bit rate. We implemented CoSA in a real test-bed where the IEEE 802.11 is employed as a medium for wireless networking. Furthermore, we verified the effectiveness of the proposed system by conducting a thorough empirical study. The results indicate that the proposed system enables real-time video streaming while maintaining a consistent visual quality by dynamically reconfiguring video coding parameters according to the link quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2015:CAM, author = "Ming-Ju Wu and Jyh-Shing R. Jang", title = "Combining Acoustic and Multilevel Visual Features for Music Genre Classification", journal = j-TOMM, volume = "12", number = "1", pages = "10:1--10:??", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2801127", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Aug 28 06:14:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Most music genre classification approaches extract acoustic features from frames to capture timbre information, leading to the common framework of bag-of-frames analysis. However, time-frequency analysis is also vital for modeling music genres. This article proposes multilevel visual features for extracting spectrogram textures and their temporal variations. A confidence-based late fusion is proposed for combining the acoustic and visual features. 
The experimental results indicated that the proposed method achieved an accuracy improvement of approximately 14\% and 2\% in the world's largest benchmark dataset (MASD) and Unique dataset, respectively. In particular, the proposed approach won the Music Information Retrieval Evaluation eXchange (MIREX) music genre classification contests from 2011 to 2013, demonstrating the feasibility and necessity of combining acoustic and visual features for classifying music genres.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{She:2015:ISI, author = "James She and Alvin Chin and Feng Xia and Jon Crowcroft", title = "Introduction to: Special Issue on {Smartphone}-Based Interactive Technologies, Systems, and Applications", journal = j-TOMM, volume = "12", number = "1s", pages = "11:1--11:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2820398", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhu:2015:SSB, author = "Biao Zhu and Hongxin Zhang and Wei Chen and Feng Xia and Ross Maciejewski", title = "{ShotVis}: {Smartphone}-Based Visualization of {OCR} Information from Images", journal = j-TOMM, volume = "12", number = "1s", pages = "12:1--12:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808210", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While visualization has been widely used as a data presentation tool in both desktop and mobile devices, the rapid visualization of information from images is still underexplored. In this work, we present a smartphone image acquisition and visualization approach for text-based data. Our prototype, ShotVis, takes images of text captured from mobile devices and extracts information for visualization. First, scattered characters in the text are processed and interactively reformulated to be stored as structured data (i.e., tables of numbers, lists of words, sentences). From there, ShotVis allows users to interactively bind visual forms to the underlying data and produce visualizations of the selected forms through touch-based interactions. In this manner, ShotVis can quickly summarize text from images into word clouds, scatterplots, and various other visualizations all through a simple click of the camera. In this way, ShotVis facilitates the interactive exploration of text data captured via cameras in smartphone devices. 
To demonstrate our prototype, several case studies are presented along with one user study to demonstrate the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Venkatagiri:2015:ALG, author = "Seshadri Padmanabha Venkatagiri and Mun Choon Chan and Wei Tsang Ooi", title = "Automated Link Generation for Sensor-Enriched {Smartphone} Images", journal = j-TOMM, volume = "12", number = "1s", pages = "13:1--13:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808209", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The ubiquity of the smartphones makes them ideal platforms for generating in-situ content. In well-attended events, photos captured by attendees have diverse views that could be subjected to occlusion and abnormal lighting effects that could obscure the view. Such unstructured photo collections also have significant redundancy. Thus, a scene that is partially occluded or has bad contrast in one photo may be captured in another photo, possibly with higher details. We propose an application called Autolink that automatically establishes content-based links between sensor-annotated photos in unstructured photo collections captured using smartphones, such that users could navigate between high-context and high-detail images. This hierarchically structured image collection facilitates the design of applications for navigation and discovery, analytics about user photography patterns, user taste, and content/event popularity. 
Autolink includes a framework that constructs this hierarchy efficiently and with little content-specific training data by combining photo content processing with associated sensor logs obtained from multiple participants. We evaluated the performance of Autolink on two real-world sensor tagged photo datasets. The result shows that Autolink is able to efficiently cluster photos at 20 times faster than candidate algorithms, into the appropriate hierarchy with at least 70\% precision and 37\% better recall than candidate algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chu:2015:VCS, author = "Chung-Hua Chu", title = "Visual Comfort for Stereoscopic {$3$D} by Using Motion Sensors on {$3$D} Mobile Devices", journal = j-TOMM, volume = "12", number = "1s", pages = "14:1--14:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808211", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Advanced 3D mobile devices attract a lot of attentions for 3D visualization nowadays. Stereoscopic images and video taken from the 3D mobile devices are uncomfortable for 3D viewing experiences due to the limited hardware for stereoscopic 3D stabilization. The existing stereoscopic 3D stabilization methods are computationally inefficient for the 3D mobile devices. In this article, we point out that this critical issue deteriorates the 3D viewing experiences on the 3D mobile devices. To improve visual comfort, we propose an efficient and effective algorithm to stabilize the stereoscopic images and video for the 3D mobile devices. 
To rectify the video jitter, we use the gyroscope and accelerometer embedded on the mobile devices to obtain the geometry information of the cameras. Using a different method than video-content-based motion estimation, our algorithm based on the gyroscope and acceleration data can achieve higher accuracy to effectively stabilize the video. Therefore, our approach is robust in video stabilization even under poor lighting and substantial foreground motion. Our algorithm outperforms previous approaches in not only smaller running time but also the better comfort of the stereoscopic 3D visualization for the 3D mobile devices.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2015:ECA, author = "Kaikai Liu and Xiaolin Li", title = "Enabling Context-Aware Indoor Augmented Reality via {Smartphone} Sensing and Vision Tracking", journal = j-TOMM, volume = "12", number = "1s", pages = "15:1--15:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808208", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Augmented reality (AR) aims to render the world that users see and overlay information that reflects the real physical dynamics. The digital view could be potentially projected near the Point-of-Interest (POI) in a way that makes the virtual view attached to the POI even when the camera moves. Achieving smooth support for movements is a subject of extensive studies. One of the key problems is where the augmented information should be added to the field of vision in real time. 
Existing solutions either leverage GPS location for rendering outdoor AR views (hundreds of kilometers away) or rely on image markers for small-scale presentation (only for the marker region). To realize AR applications under various scales and dynamics, we propose a suite of algorithms for fine-grained AR view tracking to improve the accuracy of attitude and displacement estimation, reduce the drift, eliminate the marker, and lower the computation cost. Instead of requiring extremely high, accurate, absolute locations, we propose multimodal solutions according to mobility levels without additional hardware requirement. Experimental results demonstrate significantly less error in projecting and tracking the AR view. These results are expected to make users excited to explore their surroundings with enriched content.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ahn:2015:SHG, author = "Junho Ahn and James Williamson and Mike Gartrell and Richard Han and Qin Lv and Shivakant Mishra", title = "Supporting Healthy Grocery Shopping via Mobile Augmented Reality", journal = j-TOMM, volume = "12", number = "1s", pages = "16:1--16:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808207", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Augmented reality (AR) applications have recently become popular on modern smartphones. We explore the effectiveness of this mobile AR technology in the context of grocery shopping, in particular as a means to assist shoppers in making healthier decisions as they decide which grocery products to buy. 
We construct an AR-assisted mobile grocery-shopping application that makes real-time, customized recommendations of healthy products to users and also highlights products to avoid for various types of health concerns, such as allergies to milk or nut products, low-sodium or low-fat diets, and general caloric intake. We have implemented a prototype of this AR-assisted mobile grocery shopping application and evaluated its effectiveness in grocery store aisles. Our application's evaluation with typical grocery shoppers demonstrates that AR overlay tagging of products reduces the search time to find healthy food items, and that coloring the tags helps to improve the user's ability to quickly and easily identify recommended products, as well as products to avoid. We have evaluated our application's functionality by analyzing the data we collected from 15 in-person actual grocery-shopping subjects and 104 online application survey participants.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ma:2015:PUC, author = "Sixuan Ma and Zheng Yan", title = "{PSNController}: an Unwanted Content Control System in Pervasive Social Networking Based on Trust Management", journal = j-TOMM, volume = "12", number = "1s", pages = "17:1--17:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808206", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Pervasive social networking (PSN) supports online and instant social activities and communications in a universal and pervasive manner on the basis of heterogeneous networks. 
However, at the same time, when mobile users expect useful and valuable contents via PSN, they may also receive unwanted, unexpected, or even malicious contents. These contents may intrude user devices, occupy device memories, and irritate mobile users. Unwanted content control in PSN has become a crucial issue that impacts the success of PSN usage. Nowadays, the literature still lacks a robust and generic unwanted content control system that can be practically applied. In this article, we present the design and implementation of PSNController, an unwanted content control system in PSN based on trust management. We evaluate the system performance under a variety of intrusions and attacks. The result shows the system is effective with regard to accuracy, efficiency, and robustness. It can control unwanted contents in PSN according to trust evaluation. We further study user acceptance on PSNController prototype system based on a small-scale user study. We receive sound user feedback on PSNController with regard to perceived ease of use, perceived usefulness, interface design, playfulness, and acceptance attitude.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hao:2015:LEP, author = "Fei Hao and Mingjie Jiao and Geyong Min and Laurence T. 
Yang", title = "Launching an Efficient Participatory Sensing Campaign: a Smart Mobile Device-Based Approach", journal = j-TOMM, volume = "12", number = "1s", pages = "18:1--18:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808198", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Participatory sensing is a promising sensing paradigm that enables collection, processing, dissemination and analysis of the phenomena of interest by ordinary citizens through their handheld sensing devices. Participatory sensing has huge potential in many applications, such as smart transportation and air quality monitoring. However, participants may submit low-quality, misleading, inaccurate, or even malicious data if a participatory sensing campaign is not launched effectively. Therefore, it has become a significant issue to establish an efficient participatory sensing campaign for improving the data quality. This article proposes a novel five-tier framework of participatory sensing and addresses several technical challenges in this proposed framework including: (1) optimized deployment of data collection points (DC-points); and (2) efficient recruitment strategy of participants. Toward this end, the deployment of DC-points is formulated as an optimization problem with maximum utilization of sensor and then a Wise-Dynamic DC-points Deployment (WD3) algorithm is designed for high-quality sensing. Furthermore, to guarantee the reliable sensing data collection and communication, a trajectory-based strategy for participant recruitment is proposed to enable campaign organizers to identify well-suited participants for data sensing based on a joint consideration of temporal availability, trust, and energy. 
Extensive experiments and performance analysis of the proposed framework and associated algorithms are conducted. The results demonstrate that the proposed algorithm can achieve a good sensing coverage with a smaller number of DC-points, and the participants that are termed as social sensors are easily selected, to evaluate the feasibility and extensibility of the proposed recruitment strategies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rawat:2015:CAP, author = "Yogesh Singh Rawat and Mohan S. Kankanhalli", title = "Context-Aware Photography Learning for Smart Mobile Devices", journal = j-TOMM, volume = "12", number = "1s", pages = "19:1--19:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808199", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this work we have developed a photography model based on machine learning which can assist a user in capturing high quality photographs. As scene composition and camera parameters play a vital role in aesthetics of a captured image, the proposed method addresses the problem of learning photographic composition and camera parameters. Further, we observe that context is an important factor from a photography perspective, we therefore augment the learning with associated contextual information. The proposed method utilizes publicly available photographs along with social media cues and associated metainformation in photography learning. 
We define context features based on factors such as time, geolocation, environmental conditions and type of image, which have an impact on photography. We also propose the idea of computing the photographic composition basis, eigenrules and baserules, to support our composition learning. The proposed system can be used to provide feedback to the user regarding scene composition and camera parameters while the scene is being captured. It can also recommend position in the frame where people should stand for better composition. Moreover, it also provides camera motion guidance for pan, tilt and zoom to the user for improving scene composition.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Canazza:2015:ATM, author = "Sergio Canazza and Carlo Fantozzi and Niccol{\`o} Pretto", title = "Accessing Tape Music Documents on Mobile Devices", journal = j-TOMM, volume = "12", number = "1s", pages = "20:1--20:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808200", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The aim of this article is to present and discuss an innovative methodology aimed at accessing digitized copies of historical tape music audio documents; the methodology leverages on the multimedia and multisensory capabilities of mobile devices to provide an unprecedented level of fruition. In addition to the methodology, and stemming from it, we present an actual software application for Android tablet devices.
This novel piece of software was designed and developed in a multidisciplinary team involving engineers as well as musicians, composers, and archivists. The strongest element in our work is the fact that it follows a rigorous process and it is based on the principles of philological awareness; thus, it also takes into consideration the critical points in the musicologist's domain such as (i) the definition of preservation (i.e., master) copy, (ii) the importance of secondary information, (iii) the history of production and transmission of audio documents.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2015:SCC, author = "Xiping Hu and Junqi Deng and Jidi Zhao and Wenyan Hu and Edith C.-H. Ngai and Renfei Wang and Johnny Shen and Min Liang and Xitong Li and Victor C. M. Leung and Yu-Kwong Kwok", title = "{SAfeDJ}: a Crowd-Cloud Codesign Approach to Situation-Aware Music Delivery for Drivers", journal = j-TOMM, volume = "12", number = "1s", pages = "21:1--21:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808201", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Driving is an integral part of our everyday lives, but it is also a time when people are uniquely vulnerable. Previous research has demonstrated that not only does listening to suitable music while driving not impair driving performance, but it could lead to an improved mood and a more relaxed body state, which could improve driving performance and promote safe driving significantly. 
In this article, we propose SAfeDJ, a smartphone-based situation-aware music recommendation system, which is designed to turn driving into a safe and enjoyable experience. SAfeDJ aims at helping drivers to diminish fatigue and negative emotion. Its design is based on novel interactive methods, which enable in-car smartphones to orchestrate multiple sources of sensing data and the drivers' social context, in collaboration with cloud computing to form a seamless crowdsensing solution. This solution enables different smartphones to collaboratively recommend preferable music to drivers according to each driver's specific situations in an automated and intelligent manner. Practical experiments of SAfeDJ have proved its effectiveness in music-mood analysis, and mood-fatigue detections of drivers with reasonable computation and communication overheads on smartphones. Also, our user studies have demonstrated that SAfeDJ helps to decrease fatigue degree and negative mood degree of drivers by 49.09\% and 36.35\%, respectively, compared to traditional smartphone-based music player under similar driving situations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Baldauf:2015:ISG, author = "Matthias Baldauf and Peter Fr{\"o}hlich and Florence Adegeye and Stefan Suette", title = "Investigating On-Screen Gamepad Designs for {Smartphone}-Controlled Video Games", journal = j-TOMM, volume = "12", number = "1s", pages = "22:1--22:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808202", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "On-screen gamepads are increasingly used as controllers for video games on distant screens, yet lack the typical tactile feedback known from hardware controllers. We conducted a comparative lab study to investigate four smartphone gamepads inspired by traditional game controllers and mobile game controls (directional buttons, directional pad, floating joystick, tilt control). The study consisted of both completing a formal control test as well as controlling two popular video games of different genres (Pac-Man and Super Mario Bros.). The results indicate that the directional buttons require the most attention of the user, however, work precisely for direction-restricted navigational tasks. Directional pad and joystick showed a similar performance, yet they encourage drifting and unintended operations when the user is focused on the remote screen. While currently unfamiliar to many users, the floating joystick can reduce the glances at the device. Tilt turned out to be not sufficiently precise and quick for the investigated tasks. 
The article concludes with derived design guidelines with easily realizable measures for typical contexts such as casual gaming at home or spontaneous gaming on public displays.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bental:2015:SSL, author = "Diana S. Bental and Eliza Papadopoulou and Nicholas K. Taylor and M. Howard Williams and Fraser R. Blackmun and Idris S. Ibrahim and Mei Yii Lim and Ioannis Mimtsoudis and Stuart W. Whyte and Edel Jennings", title = "Smartening Up the Student Learning Experience with Ubiquitous Media", journal = j-TOMM, volume = "12", number = "1s", pages = "23:1--23:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808203", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article describes how an experimental platform for social, mobile and ubiquitous computing has been used in a wide-ranging longitudinal ``in the wild'' case study of the platform with a set of third-party services. The article outlines some of the relevant aspects of the platform, including built-in support for community formation, for context sensitivity, automated learning and adaptation to the user, and for management of privacy and trust relationships. The platform architecture is based on the notion of Cooperating Smart Spaces (CSSs), where a CSS is a partition of the platform corresponding to a single user and distributed over the devices belonging to that user. 
Three of the case study services were intended for use in a physical environment specifically created to support ubiquitous intelligence; they were highly interactive and used shared screens, voice input and gestural interaction. Another three ubiquitous services were available throughout the university environment as mobile and desktop services. The case study exploited this architecture's ability to integrate multiple novel applications and interface devices and to deliver them flexibly in these different environments. The platform proved to be stable and reliable and the study shows that treating a provider of services and resources (the University) as a CSS is instrumental in enabling the platform to provide this range of services across differing environments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hung:2015:ISI, author = "Hayley Hung and George Toderici", title = "Introduction to: Special Issue on Extended Best Papers from {ACM Multimedia 2014}", journal = j-TOMM, volume = "12", number = "1s", pages = "24:1--24:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2820400", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kim:2015:ERD, author = "Yelin Kim and Emily Mower Provost", title = "Emotion Recognition During Speech Using Dynamics of Multiple Regions of the Face", journal = j-TOMM, volume = "12", number = "1s", pages = "25:1--25:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808204", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The need for human-centered, affective multimedia interfaces has motivated research in automatic emotion recognition. In this article, we focus on facial emotion recognition. Specifically, we target a domain in which speakers produce emotional facial expressions while speaking. The main challenge of this domain is the presence of modulations due to both emotion and speech. For example, an individual's mouth movement may be similar when he smiles and when he pronounces the phoneme /IY/, as in ``cheese''. The result of this confusion is a decrease in performance of facial emotion recognition systems. In our previous work, we investigated the joint effects of emotion and speech on facial movement. We found that it is critical to employ proper temporal segmentation and to leverage knowledge of spoken content to improve classification performance. In the current work, we investigate the temporal characteristics of specific regions of the face, such as the forehead, eyebrow, cheek, and mouth. We present methodology that uses the temporal patterns of specific regions of the face in the context of a facial emotion recognition system. We test our proposed approaches on two emotion datasets, the IEMOCAP and SAVEE datasets. 
Our results demonstrate that the combination of emotion recognition systems based on different facial regions improves overall accuracy compared to systems that do not leverage different characteristics of individual regions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Feng:2015:CAC, author = "Fangxiang Feng and Xiaojie Wang and Ruifan Li and Ibrar Ahmad", title = "Correspondence Autoencoders for Cross-Modal Retrieval", journal = j-TOMM, volume = "12", number = "1s", pages = "26:1--26:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808205", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 21 16:37:02 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article considers the problem of cross-modal retrieval, such as using a text query to search for images and vice-versa. Based on different autoencoders, several novel models are proposed here for solving this problem. These models are constructed by correlating hidden representations of a pair of autoencoders. A novel optimal objective, which minimizes a linear combination of the representation learning errors for each modality and the correlation learning error between hidden representations of two modalities, is used to train the model as a whole. Minimizing the correlation learning error forces the model to learn hidden representations with only common information in different modalities, while minimizing the representation learning error makes hidden representations good enough to reconstruct inputs of each modality. 
To balance the two kinds of errors induced by representation learning and correlation learning, we set a specific parameter in our models. Furthermore, according to the modalities the models attempt to reconstruct they are divided into two groups. One group including three models is named multimodal reconstruction correspondence autoencoder since it reconstructs both modalities. The other group including two models is named unimodal reconstruction correspondence autoencoder since it reconstructs a single modality. The proposed models are evaluated on three publicly available datasets. And our experiments demonstrate that our proposed correspondence autoencoders perform significantly better than three canonical correlation analysis based models and two popular multimodal deep models on cross-modal retrieval tasks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2016:SPS, author = "Longyu Zhang and Haiwei Dong and Abdulmotaleb {El Saddik}", title = "From {$3$D} Sensing to Printing: a Survey", journal = j-TOMM, volume = "12", number = "2", pages = "27:1--27:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818710", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Three-dimensional (3D) sensing and printing technologies have reshaped our world in recent years. In this article, a comprehensive overview of techniques related to the pipeline from 3D sensing to printing is provided.
We compare the latest 3D sensors and 3D printers and introduce several sensing, postprocessing, and printing techniques available from both commercial deployments and published research. In addition, we demonstrate several devices, software, and experimental results of our related projects to further elaborate details of this process. A case study is conducted to further illustrate the possible tradeoffs during the process of this pipeline. Current progress, future research trends, and potential risks of 3D technologies are also discussed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Petrangeli:2016:QDR, author = "Stefano Petrangeli and Jeroen Famaey and Maxim Claeys and Steven Latr{\'e} and Filip {De Turck}", title = "{QoE}-Driven Rate Adaptation Heuristic for Fair Adaptive Video Streaming", journal = j-TOMM, volume = "12", number = "2", pages = "28:1--28:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818361", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "HTTP Adaptive Streaming (HAS) is quickly becoming the de facto standard for video streaming services. In HAS, each video is temporally segmented and stored in different quality levels. Rate adaptation heuristics, deployed at the video player, allow the most appropriate level to be dynamically requested, based on the current network conditions. It has been shown that today's heuristics underperform when multiple clients consume video at the same time, due to fairness issues among clients. 
Concretely, this means that different clients negatively influence each other as they compete for shared network resources. In this article, we propose a novel rate adaptation algorithm called FINEAS (Fair In-Network Enhanced Adaptive Streaming), capable of increasing clients' Quality of Experience (QoE) and achieving fairness in a multiclient setting. A key element of this approach is an in-network system of coordination proxies in charge of facilitating fair resource sharing among clients. The strength of this approach is threefold. First, fairness is achieved without explicit communication among clients and thus no significant overhead is introduced into the network. Second, the system of coordination proxies is transparent to the clients, that is, the clients do not need to be aware of its presence. Third, the HAS principle is maintained, as the in-network components only provide the clients with new information and suggestions, while the rate adaptation decision remains the sole responsibility of the clients themselves. We evaluate this novel approach through simulations, under highly variable bandwidth conditions and in several multiclient scenarios. We show how the proposed approach can improve fairness up to 80\% compared to state-of-the-art HAS heuristics in a scenario with three networks, each containing 30 clients streaming video at the same time.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sun:2016:SOR, author = "Shaoyan Sun and Wengang Zhou and Qi Tian and Houqiang Li", title = "Scalable Object Retrieval with Compact Image Representation from Generic Object Regions", journal = j-TOMM, volume = "12", number = "2", pages = "29:1--29:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818708", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In content-based visual object retrieval, image representation is one of the fundamental issues in improving retrieval performance. Existing works adopt either local SIFT-like features or holistic features, and may suffer sensitivity to noise or poor discrimination power. In this article, we propose a compact representation for scalable object retrieval from few generic object regions. The regions are identified with a general object detector and are described with a fusion of learning-based features and aggregated SIFT features. Further, we compress feature representation in large-scale image retrieval scenarios. We evaluate the performance of the proposed method on two public ground-truth datasets, with promising results. Experimental results on a million-scale image database demonstrate superior retrieval accuracy with efficiency gain in both computation and memory usage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ebrahim:2016:MIB, author = "Mansoor Ebrahim and Wai Chong Chia", title = "Multiview Image Block Compressive Sensing with Joint Multiphase Decoding for Visual Sensor Network", journal = j-TOMM, volume = "12", number = "2", pages = "30:1--30:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818712", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, a multiview image compression framework, which involves the use of Block-based Compressive Sensing (BCS) and Joint Multiphase Decoding (JMD), is proposed for a Visual Sensor Network (VSN). In the proposed framework, one of the sensor nodes is configured to serve as the reference node, the others as nonreference nodes. The images are encoded independently using the BCS to produce two observed measurements that are transmitted to the host workstation. In this case, the nonreference nodes always encoded the images (I$_{NR}$ ) at a lower subrate when compared with the images from the reference nodes (I$_R$ ). The idea is to improve the reconstruction of I$_{NR}$ using I$_R$. After the two observed measurements are received by the host workstation, they are first decoded independently, then image registration is applied to align I$_R$ onto the same plane of I$_{NR}$. The aligned I$_R$ is then fused with I$_{NR}$, using wavelets to produce the projected image I$_P$. Subsequently, the difference between the measurements of the I$_P$ and I$_{NR}$ is calculated. 
The difference is then decoded and added to I$_P$ to produce the final reconstructed I$_{NR}$. The simulation results show that the proposed framework is able to improve the quality of I$_{NR}$ on average by 2dB to 3dB at lower subrates when compared with other Compressive Sensing (CS)--based multiview image compression frameworks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pang:2016:OQA, author = "Lei Pang and Chong-Wah Ngo", title = "Opinion Question Answering by Sentiment Clip Localization", journal = j-TOMM, volume = "12", number = "2", pages = "31:1--31:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818711", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article considers multimedia question answering beyond factoid and how-to questions. We are interested in searching videos for answering opinion-oriented questions that are controversial and hotly debated. Examples of questions include ``Should Edward Snowden be pardoned?'' and ``Obamacare-unconstitutional or not?''. These questions often invoke emotional response, either positively or negatively, hence are likely to be better answered by videos than texts, due to the vivid display of emotional signals visible through facial expression and speaking tone. Nevertheless, a potential answer of duration 60s may be embedded in a video of 10min, resulting in degraded user experience compared to reading the answer in text only. 
Furthermore, a text-based opinion question may be short and vague, while the video answers could be verbal, less structured grammatically, and noisy because of errors in speech transcription. Direct matching of words or syntactic analysis of sentence structure, such as adopted by factoid and how-to question-answering, is unlikely to find video answers. The first problem, the answer localization, is addressed by audiovisual analysis of the emotional signals in videos for locating video segments likely expressing opinions. The second problem, questions and answers matching, is tackled by a deep architecture that nonlinearly matches text words in questions and speeches in videos. Experiments are conducted on eight controversial topics based on questions crawled from Yahoo! Answers and Internet videos from YouTube.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Papapanagiotou:2016:ICB, author = "Vasileios Papapanagiotou and Christos Diou and Anastasios Delopoulos", title = "Improving Concept-Based Image Retrieval with Training Weights Computed from Tags", journal = j-TOMM, volume = "12", number = "2", pages = "32:1--32:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2790230", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a novel approach to training classifiers for concept detection using tags and a variant of Support Vector Machine that enables the usage of training weights per sample. 
Combined with an appropriate tag weighting mechanism, more relevant samples play a more important role in the calibration of the final concept-detector model. We propose a complete, automated framework that (i) calculates relevance scores for each image-concept pair based on image tags, (ii) transforms the scores into relevance probabilities and automatically annotates each image according to this probability, (iii) transforms either the relevance scores or the probabilities into appropriate training weights and finally, (iv) incorporates the training weights and the visual features into a Fuzzy Support Vector Machine classifier to build the concept-detector model. The framework can be applied to online public collections, by gathering a large pool of diverse images, and using the calculated probability to select a training set and the associated training weights. To evaluate our argument, we experiment on two large annotated datasets. Experiments highlight the retrieval effectiveness of the proposed approach. Furthermore, experiments with various levels of annotation error show that using weights derived from tags significantly increases the robustness of the resulting concept detectors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2016:AGV, author = "Xuyong Yang and Tao Mei and Ying-Qing Xu and Yong Rui and Shipeng Li", title = "Automatic Generation of Visual-Textual Presentation Layout", journal = j-TOMM, volume = "12", number = "2", pages = "33:1--33:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818709", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Visual-textual presentation layout (e.g., digital magazine cover, poster, Power Point slides, and any other rich media), which combines beautiful image and overlaid readable texts, can result in an eye candy touch to attract users' attention. The designing of visual-textual presentation layout is therefore becoming ubiquitous in both commercially printed publications and online digital magazines. However, handcrafting aesthetically compelling layouts still remains challenging for many small businesses and amateur users. This article presents a system to automatically generate visual-textual presentation layouts by investigating a set of aesthetic design principles, through which an average user can easily create visually appealing layouts. The system is attributed with a set of topic-dependent layout templates and a computational framework integrating high-level aesthetic principles (in a top-down manner) and low-level image features (in a bottom-up manner). The layout templates, designed with prior knowledge from domain experts, define spatial layouts, semantic colors, harmonic color models, and font emotion and size constraints. 
We formulate the typography as an energy optimization problem by minimizing the cost of text intrusion, the utility of visual space, and the mismatch of information importance in perception and semantics, constrained by the automatically selected template and further preserving color harmonization. We demonstrate that our designs achieve the best reading experience compared with the reimplementation of parts of existing state-of-the-art designs through a series of user studies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2016:MCR, author = "Xuelong Li and Mulin Chen and Qi Wang", title = "Measuring Collectiveness via Refined Topological Similarity", journal = j-TOMM, volume = "12", number = "2", pages = "34:1--34:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2854000", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Crowd system has motivated a surge of interests in many areas of multimedia, as it contains plenty of information about crowd scenes. In crowd systems, individuals tend to exhibit collective behaviors, and the motion of all those individuals is called collective motion. As a comprehensive descriptor of collective motion, collectiveness has been proposed to reflect the degree of individuals moving as an entirety. Nevertheless, existing works mostly have limitations to correctly find the individuals of a crowd system and precisely capture the various relationships between individuals, both of which are essential to measure collectiveness. 
In this article, we propose a collectiveness-measuring method that is capable of quantifying collectiveness accurately. Our main contributions are threefold: (1) we compute relatively accurate collectiveness by making the tracked feature points represent the individuals more precisely with a point selection strategy; (2) we jointly investigate the spatial-temporal information of individuals and utilize it to characterize the topological relationship between individuals by manifold learning; (3) we propose a stability descriptor to deal with the irregular individuals, which influence the calculation of collectiveness. Intensive experiments on the simulated and real world datasets demonstrate that the proposed method is able to compute relatively accurate collectiveness and keep high consistency with human perception.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tyson:2016:MAM, author = "Gareth Tyson and Yehia Elkhatib and Nishanth Sastry and Steve Uhlig", title = "Measurements and Analysis of a Major Adult Video Portal", journal = j-TOMM, volume = "12", number = "2", pages = "35:1--35:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2854003", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Today, the Internet is a large multimedia delivery infrastructure, with websites such as YouTube appearing at the top of most measurement studies. However, most traffic studies have ignored an important domain: adult multimedia distribution. 
Whereas, traditionally, such services were provided primarily via bespoke websites, recently these have converged towards what is known as ``Porn 2.0''. These services allow users to upload, view, rate, and comment on videos for free (much like YouTube). Despite their scale, we still lack even a basic understanding of their operation. This article addresses this gap by performing a large-scale study of one of the most popular Porn 2.0 websites: YouPorn. Our measurements reveal a global delivery infrastructure that we have repeatedly crawled to collect statistics (on 183k videos). We use this data to characterise the corpus, as well as to inspect popularity trends and how they relate to other features, for example, categories and ratings. To explore our discoveries further, we use a small-scale user study, highlighting key system implications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Thomee:2016:FSP, author = "Bart Thomee and Ioannis Arapakis and David A. Shamma", title = "Finding Social Points of Interest from Georeferenced and Oriented Online Photographs", journal = j-TOMM, volume = "12", number = "2", pages = "36:1--36:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2854004", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 3 17:36:33 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Points of interest are an important requirement for location-based services, yet they are editorially curated and maintained, either professionally or through community. 
Beyond the laborious manual annotation task, further complications arise as points of interest may appear, relocate, or disappear over time, and may be relevant only to specific communities. To assist, complement, or even replace manual annotation, we propose a novel method for the automatic localization of points of interest depicted in photos taken by people across the world. Our technique exploits the geographic coordinates and the compass direction supplied by modern cameras, while accounting for possible measurement errors due to the variability in accuracy of the sensors that produced them. We statistically demonstrate that our method significantly outperforms techniques from the research literature on the task of estimating the geographic coordinates and geographic footprints of points of interest in various cities, even when photos are involved in the estimation process that do not show the point of interest at all.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{delBimbo:2016:PEC, author = "Alberto del Bimbo", title = "From the Past {Editor-In-Chief}", journal = j-TOMM, volume = "12", number = "3", pages = "37e:1--37e:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2903774", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "37e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2016:SPR, author = "Luming Zhang and Xuelong Li and Liqiang Nie and Yan Yan and Roger Zimmermann", title = "Semantic Photo Retargeting Under Noisy Image Labels", journal = j-TOMM, volume = "12", number = "3", pages = "37:1--37:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2886775", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the popularity of mobile devices, photo retargeting has become a useful technique that adapts a high-resolution photo onto a low-resolution screen. Conventional approaches are limited in two aspects. The first factor is the de-emphasized role of semantic content that is many times more important than low-level features in photo aesthetics. Second is the importance of image spatial modeling: toward a semantically reasonable retargeted photo, the spatial distribution of objects within an image should be accurately learned. To solve these two problems, we propose a new semantically aware photo retargeting that shrinks a photo according to region semantics. The key technique is a mechanism transferring semantics of noisy image labels (inaccurate labels predicted by a learner like an SVM) into different image regions. In particular, we first project the local aesthetic features (graphlets in this work) onto a semantic space, wherein image labels are selectively encoded according to their noise level. Then, a category-sharing model is proposed to robustly discover the semantics of each image region. 
The model is motivated by the observation that the semantic distribution of graphlets from images tagged by a common label remains stable in the presence of noisy labels. Thereafter, a spatial pyramid is constructed to hierarchically encode the spatial layout of graphlet semantics. Based on this, a probabilistic model is proposed to enforce the spatial layout of a retargeted photo to be maximally similar to those from the training photos. Experimental results show that (1) noisy image labels predicted by different learners can improve the retargeting performance, according to both qualitative and quantitative analysis, and (2) the category-sharing model stays stable even when 32.36\% of image labels are incorrectly predicted.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhou:2016:MDD, author = "Liang Zhou", title = "Mobile Device-to-Device Video Distribution: Theory and Application", journal = j-TOMM, volume = "12", number = "3", pages = "38:1--38:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2886776", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As video traffic has dominated the data flow of smartphones, traditional cellular communications face substantial transmission challenges. In this work, we study mobile device-to-device (D2D) video distribution that leverages the storage and communication capacities of smartphones. In such a mobile distributed framework, D2D communication represents an opportunistic process to selectively store and transmit local videos to meet the future demand of others. 
The performance is measured by the service time, which denotes the elapsed period for fulfilling the demand, and the corresponding implementation of each device depends on the video's demand, availability, and size. The main contributions of this work lie in (1) considering the impact of video size in a practical mobile D2D video distribution scenario and proposing a general global estimation of the video distribution based on limited and local observations; (2) designing a purely distributed D2D video distribution scheme without the monitoring of any central controller; and (3) providing a practical implementation of the scheme, which does not need to know the video availability, user demand, and device mobility. Numerical results have demonstrated the efficiency and robustness of the proposed scheme.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ravi:2016:FAL, author = "Hareesh Ravi and A. V. Subramanyam and Sabu Emmanuel", title = "Forensic Analysis of Linear and Nonlinear Image Filtering Using Quantization Noise", journal = j-TOMM, volume = "12", number = "3", pages = "39:1--39:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2857069", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The availability of intelligent image editing techniques and antiforensic algorithms makes it convenient to manipulate an image and to hide the artifacts that it might have produced in the process. 
Real world forgeries are generally followed by the application of enhancement techniques such as filtering and/or conversion of the image format to suppress the forgery artifacts. Though several techniques evolved in the direction of detecting some of these manipulations, additional operations like recompression, nonlinear filtering, and other antiforensic methods during forgery are not deeply investigated. Toward this, we propose a robust method to detect whether a given image has undergone filtering (linear or nonlinear) based enhancement, possibly followed by format conversion after forgery. In the proposed method, JPEG quantization noise is obtained using natural image prior and quantization noise models. Transition probability features extracted from the quantization noise are used for machine learning based detection and classification. We test the effectiveness of the algorithm in classifying the class of the filter applied and the efficacy in detecting filtering in low resolution images. Experiments are performed to compare the performance of the proposed technique with state-of-the-art forensic filtering detection algorithms. It is found that the proposed technique is superior in most of the cases. Also, experiments against popular antiforensic algorithms show the counter antiforensic robustness of the proposed technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2016:SND, author = "Xianjun Hu and Weiming Zhang and Ke Li and Honggang Hu and Nenghai Yu", title = "Secure Nonlocal Denoising in Outsourced Images", journal = j-TOMM, volume = "12", number = "3", pages = "40:1--40:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2886777", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Signal processing in the encrypted domain becomes a desired technique to protect privacy of outsourced data in cloud. In this article, we propose a double-cipher scheme to implement nonlocal means (NLM) denoising in encrypted images. In this scheme, one ciphertext is generated by the Paillier scheme, which enables the mean filter, and the other is obtained by a privacy-preserving transform, which enables the nonlocal search. By the privacy-preserving transform, the cloud server can search the similar pixel blocks in the ciphertexts with the same speed as in the plaintexts; thus, the proposed method can be executed fast. To enhance the security, we randomly permutate both ciphertexts. To reduce the denoising complexity caused by random permutation, a random NLM method is exploited in the encrypted domain. The experimental results show that the quality of denoised images in the encrypted domain is comparable to that obtained in the plain domain.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Calagari:2016:DPS, author = "Kiana Calagari and Tarek Elgamal and Khaled Diab and Krzysztof Templin and Piotr Didyk and Wojciech Matusik and Mohamed Hefeeda", title = "Depth Personalization and Streaming of Stereoscopic Sports Videos", journal = j-TOMM, volume = "12", number = "3", pages = "41:1--41:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2890103", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current three-dimensional displays cannot fully reproduce all depth cues used by a human observer in the real world. Instead, they create only an illusion of looking at a three-dimensional scene. This leads to a number of challenges during the content creation process. To assure correct depth reproduction and visual comfort, either the acquisition setup has to be carefully controlled or additional postprocessing techniques have to be applied. Furthermore, these manipulations need to account for a particular setup that is used to present the content, for example, viewing distance or screen size. This creates additional challenges in the context of personal use when stereoscopic content is shown on TV sets, desktop monitors, or mobile devices. We address this problem by presenting a new system for streaming stereoscopic content. Its key feature is a computationally efficient depth adjustment technique which can automatically optimize viewing experience for videos of field sports such as soccer, football, and tennis. Additionally, the method enables depth personalization to allow users to adjust the amount of depth according to their preferences. 
Our stereoscopic video streaming system was implemented, deployed, and tested with real users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2016:ERM, author = "Qiong Wu and Pierre Boulanger", title = "Enhanced Reweighted {MRFs} for Efficient Fashion Image Parsing", journal = j-TOMM, volume = "12", number = "3", pages = "42:1--42:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2890104", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Previous image parsing methods usually model the problem in a conditional random field which describes a statistical model learned from a training dataset and then processes a query image using the conditional probability. However, for clothing images, fashion items have a large variety of layering and configuration, and it is hard to learn a certain statistical model of features that apply to general cases. In this article, we take fashion images as an example to show how Markov Random Fields (MRFs) can outperform Conditional Random Fields when the application does not follow a certain statistical model learned from the training data set. We propose a new method for automatically parsing fashion images in high processing efficiency with significantly less training time by applying a modification of MRFs, named reweighted MRF (RW-MRF), which resolves the problem of over smoothing infrequent labels. We further enhance RW-MRF with occlusion prior and background prior to resolve two other common problems in clothing parsing, occlusion, and background spill. 
Our experimental results indicate that our proposed clothing parsing method significantly improves processing time and training time over state-of-the-art methods, while ensuring comparable parsing accuracy and improving label recall rate.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2016:ADA, author = "Yao Hu and Chen Zhao and Deng Cai and Xiaofei He and Xuelong Li", title = "Atom Decomposition with Adaptive Basis Selection Strategy for Matrix Completion", journal = j-TOMM, volume = "12", number = "3", pages = "43:1--43:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2903716", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Estimating missing entries in matrices has attracted much attention due to its wide range of applications like image inpainting and video denoising, which are usually considered as low-rank matrix completion problems theoretically. It is common to consider nuclear norm as a surrogate of the rank operator since it is the tightest convex lower bound of the rank operator under certain conditions. However, most approaches based on nuclear norm minimization involve a number of singular value decomposition (SVD) operations. Given a matrix $ X \in R^{m \times n} $, the time complexity of the SVD operation is $ O(m n^2) $, which brings prohibitive computational burden on large-scale matrices, limiting the further usage of these methods in real applications. Motivated by this observation, a series of atom-decomposition-based matrix completion methods have been studied. 
The key to these methods is to reconstruct the target matrix by pursuit methods in a greedy way, which only involves the computation of the top SVD and has great advantages in efficiency compared with the SVD-based matrix completion methods. However, due to gradually serious accumulation errors, atom-decomposition-based methods usually result in unsatisfactory reconstruction accuracy. In this article, we propose a new efficient and scalable atom decomposition algorithm for matrix completion called Adaptive Basis Selection Strategy (ABSS). Different from traditional greedy atom decomposition methods, a two-phase strategy is conducted to generate the basis separately via different strategies according to their different nature. At first, we globally prune the basis space to eliminate the unimportant basis as much as possible and locate the probable subspace containing the most informative basis. Then, another group of basis spaces are learned to improve the recovery accuracy based on local information. In this way, our proposed algorithm breaks through the accuracy bottleneck of traditional atom-decomposition-based matrix completion methods; meanwhile, it reserves the innate efficiency advantages over SVD-based matrix completion methods. We empirically evaluate the proposed algorithm ABSS on real visual image data and large-scale recommendation datasets. Results have shown that ABSS has much better reconstruction accuracy with comparable cost to atom-decomposition-based methods. At the same time, it outperforms the state-of-the-art SVD-based matrix completion algorithms by similar or better reconstruction accuracy with enormous advantages on efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Miao:2016:HFL, author = "Dan Miao and Jingjing Fu and Yan Lu and Shipeng Li and Chang Wen Chen", title = "A High-Fidelity and Low-Interaction-Delay Screen Sharing System", journal = j-TOMM, volume = "12", number = "3", pages = "44:1--44:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2897395", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The pervasive computing environment and wide network bandwidth provide users more opportunities to share screen content among multiple devices. In this article, we introduce a remote display system to enable screen sharing among multiple devices with high fidelity and responsive interaction. In the developed system, the frame-level screen content is compressed and transmitted to the client side for screen sharing, and the instant control inputs are simultaneously transmitted to the server side for interaction. Even if the screen responds immediately to the control messages and updates at a high frame rate on the server side, it is difficult to update the screen content with low delay and high frame rate in the client side due to non-negligible time consumption on the whole screen frame compression, transmission, and display buffer updating. To address this critical problem, we propose a layered structure for screen coding and rendering to deliver diverse screen content to the client side with an adaptive frame rate. 
More specifically, the interaction content with small region screen update is compressed by a blockwise screen codec and rendered at a high frame rate to achieve smooth interaction, while the natural video screen content is compressed by standard video codec and rendered at a regular frame rate for a smooth video display. Experimental results with real applications demonstrate that the proposed system can successfully reduce transmission bandwidth cost and interaction delay during screen sharing. Especially for user interaction in small regions, the proposed system can achieve a higher frame rate than most previous counterparts.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wilk:2016:CAV, author = "Stefan Wilk and Stephan Kopf and Wolfgang Effelsberg", title = "Collaborative Annotation of Videos Relying on Weak Consistency", journal = j-TOMM, volume = "12", number = "3", pages = "45:1--45:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2907983", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This work discusses a distributed interactive video system that supports video annotation using simultaneous hyperlinking by multiple users. The users mark and annotate objects within the video with links to other media such as text, images, websites, or other videos. Annotations are visualized on the client user interface as an overlay close to the objects. 
Our system is intuitive to use; for example, it contains automatic object-tracking functionality that correctly positions the annotations, even when the form or location of an object changes. Thus, our first contribution discusses the adaptive object-tracking algorithm used for this repositioning. It shows improved precision and reliability in comparison to nonadaptive algorithms. A second key issue is to keep the system responsive when the number of concurrent annotators increases. Thus, we rely on the concept of eventual consistency between different network entities. While this weak form of consistency allows temporary inconsistencies, it ensures that a consistent state can be reached. Thus, the second contribution is the design and evaluation of our distributed interactive video system, which relies on the weak consistency paradigm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Merani:2016:ASP, author = "Maria Luisa Merani and Laura Natali", title = "Adaptive Streaming in {P2P} Live Video Systems: a Distributed Rate Control Approach", journal = j-TOMM, volume = "12", number = "3", pages = "46:1--46:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2912123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 16 09:38:16 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is a recently proposed standard that offers different versions of the same media content to adapt the delivery process over the Internet to dynamic bandwidth fluctuations and different user device capabilities. 
The peer-to-peer (P2P) paradigm for video streaming allows us to leverage the cooperation among peers, guaranteeing the service of video requests with increased scalability and reduced cost. We propose to combine these two approaches in a P2P-DASH architecture, exploiting the potentiality of both. The new platform is made of several swarms and a different DASH representation is streamed within each of them; unlike client-server DASH architectures, where each client autonomously selects which version to download according to current network conditions and to its device resources, we put forth a new rate control strategy implemented at peer site to maintain a good viewing quality to the local user and to simultaneously guarantee the successful operation of the P2P swarms. The effectiveness of the solution is demonstrated through simulation and it indicates that the P2P-DASH platform is able to provide its users with very good performance, much more satisfying than in a conventional P2P environment where DASH is not employed. Through a comparison with a reference DASH system modeled via the Integer Linear Programming (ILP) approach, the new system is shown to outperform such reference architecture. To further validate the proposal, in terms of both robustness and scalability, system behavior is investigated in the critical condition of a flash crowd, showing that the strong upsurge of new users can be successfully revealed and gradually accommodated.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jia:2016:WGB, author = "Adele Lu Jia and Siqi Shen and Dick H. J. 
Epema and Alexandru Iosup", title = "When Game Becomes Life: The Creators and Spectators of Online Game Replays and Live Streaming", journal = j-TOMM, volume = "12", number = "4", pages = "47:1--47:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online gaming franchises such as World of Tanks, Defense of the Ancients, and StarCraft have attracted hundreds of millions of users who, apart from playing the game, also socialize with each other through gaming and viewing gamecasts. As a form of User Generated Content (UGC), gamecasts play an important role in user entertainment and gamer education. They deserve the attention of both industrial partners and the academic communities, corresponding to the large amount of revenue involved and the interesting research problems associated with UGC sites and social networks. Although previous work has put much effort into analyzing general UGC sites such as YouTube, relatively little is known about the gamecast sharing sites. In this work, we provide the first comprehensive study of gamecast sharing sites, including commercial streaming-based sites such as Amazon's Twitch.tv and community-maintained replay-based sites such as WoTreplays. We collect and share a novel dataset on WoTreplays that includes more than 380,000 game replays, shared by more than 60,000 creators with more than 1.9 million gamers. Together with an earlier published dataset on Twitch.tv, we investigate basic characteristics of gamecast sharing sites, and we analyze the activities of their creators and spectators. 
Among our results, we find that (i) WoTreplays and Twitch.tv are both fast-consumed repositories, with millions of gamecasts being uploaded, viewed, and soon forgotten; (ii) both the gamecasts and the creators exhibit highly skewed popularity, with a significant heavy tail phenomenon; and (iii) the upload and download preferences of creators and spectators are different: while the creators emphasize their individual skills, the spectators appreciate team-wise tactics. Our findings provide important knowledge for infrastructure and service improvement, for example, in the design of proper resource allocation mechanisms that consider future gamecasting and in the tuning of incentive policies that further help player retention.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Rana:2016:DBV, author = "Shuvendu Rana and Arijit Sur", title = "Depth-Based View-Invariant Blind {$3$D} Image Watermarking", journal = j-TOMM, volume = "12", number = "4", pages = "48:1--48:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957751", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the huge advance in Internet technology as well as the availability of low-cost 3D display devices, 3D image transmission has become popular in recent times. Since watermarking has become regarded as a potential Digital Rights Management (DRM) tools in the past decade, 3D image watermarking is an emerging research topic. 
With the introduction of the Depth Image-Based Rendering (DIBR) technique, 3D image watermarking is a more challenging task, especially for synthetic view generation. In this article, synthetic view generation is regarded as a potential attack, and a blind watermarking scheme is proposed that can resist it. In the proposed scheme, the watermark is embedded into the low-pass filtered dependent view region of 3D images. Block Discrete Cosine Transformation (DCT) is used for spatial-filtration of the dependent view region to find the DC coefficient with horizontally shifted coherent regions from the left and right view to make the scheme robust against synthesis view attack. A comprehensive set of experiments have been carried out to justify the robustness of the proposed scheme over related existing schemes with respect to Stereo JPEG compression and different noise addition attacks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Silva:2016:MIB, author = "Bruno M. C. Silva and Joel J. P. C. Rodrigues and Neeraj Kumar and Mario L. {Proen{\c{c}}a, Jr.} and Guangjie Han", title = "{MobiCoop}: an Incentive-Based Cooperation Solution for Mobile Applications", journal = j-TOMM, volume = "12", number = "4", pages = "49:1--49:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957752", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Network architectures based on mobile devices and wireless communications present several constraints (e.g., processor, energy storage, bandwidth, etc.) that affect the overall network performance. 
Cooperation strategies have been considered as a solution to address these network limitations. In the presence of unstable network infrastructures, mobile nodes cooperate with each other, forwarding data and performing other specific network functionalities. This article proposes a generalized incentive-based cooperation solution for mobile services and applications called MobiCoop. This reputation-based scheme includes an application framework for mobile applications that uses a Web service to handle all the nodes reputation and network permissions. The main goal of MobiCoop is to provide Internet services to mobile devices without network connectivity through cooperation with neighbor devices. The article includes a performance evaluation study of MobiCoop considering both a real scenario (using a prototype) and a simulation-based study. Results show that the proposed approach provides network connectivity independency to users with mobile apps when Internet connectivity is unavailable. Then, it is concluded that MobiCoop improved significantly the overall system performance and the service provided for a given mobile application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shivani:2016:PVC, author = "Shivendra Shivani and Suneeta Agarwal", title = "Progressive Visual Cryptography with Unexpanded Meaningful Shares", journal = j-TOMM, volume = "12", number = "4", pages = "50:1--50:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2935618", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The traditional $k$-out-of-$n$ Visual Cryptography (VC) scheme is the conception of ``all or nothing'' for $n$ participants to share a secret image. The original secret image can be visually revealed only when a subset of $k$ or more shares are superimposed together, but if the number of stacked shares are less than $k$, nothing will be revealed. On the other hand, a Progressive Visual Cryptography (PVC) scheme differs from the traditional VC with respect to decoding. In PVC, clarity and contrast of the decoded secret image will be increased progressively with the number of stacked shares. Much of the existing state-of-the-art research on PVC has problems with pixel expansion and random pattern of the shares. In this article, a novel scheme of progressive visual cryptography with four or more number of unexpanded as well as meaningful shares has been proposed. For this, a novel and efficient Candidate Block Replacement preprocessing approach and a basis matrix creation algorithm have also been introduced. 
The proposed method also eliminates many unnecessary encryption constraints like a predefined codebook for encoding and decoding the secret image, restriction on the number of participants, and so on. From the experiments, it is observed that the reconstruction probability of black pixels in the decoded image corresponding to the black pixel in the secret image is always 1, whereas that of white pixels is 0.5 irrespective of the meaningful contents visible in the shares, thus ensuring the value of contrast to always be 50\%. Therefore, a reconstructed image can be easily identified by a human visual system without any computation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ademoye:2016:AME, author = "Oluwakemi A. Ademoye and Niall Murray and Gabriel-Miro Muntean and Gheorghita Ghinea", title = "Audio Masking Effect on Inter-Component Skews in Olfaction-Enhanced Multimedia Presentations", journal = j-TOMM, volume = "12", number = "4", pages = "51:1--51:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957753", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Media-rich content plays a vital role in consumer applications today, as these applications try to find new and interesting ways to engage their users. Video, audio, and the more traditional forms of media content continue to dominate with respect to the use of media content to enhance the user experience. Tactile interactivity has also now become widely popular in modern computing applications, while our olfactory and gustatory senses continue to have a limited role. 
However, in recent times, there have been significant advancements regarding the use of olfactory media content (i.e., smell), and there are a variety of devices now available to enable its computer-controlled emission. This paper explores the impact of the audio stream on user perception of olfactory-enhanced video content in the presence of skews between the olfactory and video media. This research uses the results from two experimental studies of user-perceived quality of olfactory-enhanced multimedia, where audio was present and absent, respectively. Specifically, the paper shows that the user Quality of Experience (QoE) is generally higher in the absence of audio for nearly perfect synchronized olfactory-enhanced multimedia presentations (i.e., an olfactory media skew of between {-10,+10s}); however, for greater olfactory media skews (ranging between {-30s;-10s} and {+10s, +30s}) user QoE is higher when the audio stream is present. It can be concluded that the presence of the audio has the ability to mask larger synchronization skews between the other media components in olfaction-enhanced multimedia presentations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhong:2016:FED, author = "Sheng-Hua Zhong and Yan Liu and Kien A. 
Hua", title = "Field Effect Deep Networks for Image Recognition with Incomplete Data", journal = j-TOMM, volume = "12", number = "4", pages = "52:1--52:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957754", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image recognition with incomplete data is a well-known hard problem in computer vision and machine learning. This article proposes a novel deep learning technique called Field Effect Bilinear Deep Networks (FEBDN) for this problem. To address the difficulties of recognizing incomplete data, we design a novel second-order deep architecture with the Field Effect Restricted Boltzmann Machine, which models the reliability of the delivered information according to the availability of the features. Based on this new architecture, we propose a new three-stage learning procedure with field effect bilinear initialization, field effect abstraction and estimation, and global fine-tuning with missing features adjustment. By integrating the reliability of features into the new learning procedure, the proposed FEBDN can jointly determine the classification boundary and estimate the missing features. FEBDN has demonstrated impressive performance on recognition and estimation tasks in various standard datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yan:2016:UVR, author = "Ming Yan and Jitao Sang and Changsheng Xu and M. 
Shamim Hossain", title = "A Unified Video Recommendation by Cross-Network User Modeling", journal = j-TOMM, volume = "12", number = "4", pages = "53:1--53:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957755", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online video sharing sites are increasingly encouraging their users to connect to the social network venues such as Facebook and Twitter, with goals to boost user interaction and better disseminate the high-quality video content. This in turn provides huge possibilities to conduct cross-network collaboration for personalized video recommendation. However, very few efforts have been devoted to leveraging users' social media profiles in the auxiliary network to capture and personalize their video preferences, so as to recommend videos of interest. In this article, we propose a unified YouTube video recommendation solution by transferring and integrating users' rich social and content information in Twitter network. While general recommender systems often suffer from typical problems like cold-start and data sparsity, our proposed recommendation solution is able to effectively learn from users' abundant auxiliary information on Twitter for enhanced user modeling and well address the typical problems in a unified framework. In this framework, two stages are mainly involved: (1) auxiliary-network data transfer, where user preferences are transferred from an auxiliary network by learning cross-network knowledge associations; and (2) cross-network data integration, where transferred user preferences are integrated with the observed behaviors on a target network in an adaptive fashion. 
Experimental results show that the proposed cross-network collaborative solution achieves superior performance not only in terms of accuracy, but also in improving the diversity and novelty of the recommended videos.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jiang:2016:CVI, author = "Yijing Jiang and Shanyu Tang and Liping Zhang and Muzhou Xiong and Yau Jim Yip", title = "Covert Voice over {Internet} Protocol Communications with Packet Loss Based on Fractal Interpolation", journal = j-TOMM, volume = "12", number = "4", pages = "54:1--54:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2961053", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The last few years have witnessed an explosive growth in the research of information hiding in multimedia objects, but few studies have taken into account packet loss in multimedia networks. As one of the most popular real-time services in the Internet, Voice over Internet Protocol (VoIP) contributes to a large part of network traffic for its advantages of real time, high flow, and low cost. So packet loss is inevitable in multimedia networks and affects the performance of VoIP communications. In this study, a fractal-based VoIP steganographic approach was proposed to realize covert VoIP communications in the presence of packet loss. In the proposed scheme, secret data to be hidden were divided into blocks after being encrypted with the block cipher, and each block of the secret data was then embedded into VoIP streaming packets. 
The VoIP packets went through a packet-loss system based on Gilbert model which simulates a real network situation. And a prediction model based on fractal interpolation was built to decide whether a VoIP packet was suitable for data hiding. The experimental results indicated that the speech quality degradation increased with the escalating packet-loss level. The average variance of speech quality metrics (PESQ score) between the ``no-embedding'' speech samples and the ``with-embedding'' stego-speech samples was about 0.717, and the variances narrowed with the increasing packet-loss level. Both the average PESQ scores and the SNR values of stego-speech samples and the data-retrieving rates had almost the same varying trends when the packet-loss level increased, indicating that the success rate of the fractal prediction model played an important role in the performance of covert VoIP communications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2016:SFM, author = "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu", title = "Semantic Feature Mining for Video Event Understanding", journal = j-TOMM, volume = "12", number = "4", pages = "55:1--55:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2962719", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Content-based video understanding is extremely difficult due to the semantic gap between low-level vision signals and the various semantic concepts (object, action, and scene) in videos. 
Though feature extraction from videos has achieved significant progress, most of the previous methods rely only on low-level features, such as the appearance and motion features. Recently, visual-feature extraction has been improved significantly with machine-learning algorithms, especially deep learning. However, there is still not enough work focusing on extracting semantic features from videos directly. The goal of this article is to adopt unlabeled videos with the help of text descriptions to learn an embedding function, which can be used to extract more effective semantic features from videos when only a few labeled samples are available for video recognition. To achieve this goal, we propose a novel embedding convolutional neural network (ECNN). We evaluate our algorithm by comparing its performance on three challenging benchmarks with several popular state-of-the-art methods. Extensive experimental results show that the proposed ECNN consistently and significantly outperforms the existing methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Nilsson:2016:ASD, author = "Tommy Nilsson and Carl Hogsden and Charith Perera and Saeed Aghaee and David J. Scruton and Andreas Lund and Alan F. 
Blackwell", title = "Applying Seamful Design in Location-Based Mobile Museum Applications", journal = j-TOMM, volume = "12", number = "4", pages = "56:1--56:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2962720", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 25 07:28:05 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The application of mobile computing is currently altering patterns of our behavior to a greater degree than perhaps any other invention. In combination with the introduction of power-efficient wireless communication technologies, such as Bluetooth Low Energy (BLE), designers are today increasingly empowered to shape the way we interact with our physical surroundings and thus build entirely new experiences. However, our evaluations of BLE and its abilities to facilitate mobile location-based experiences in public environments revealed a number of potential problems. Most notably, the position and orientation of the user in combination with various environmental factors, such as crowds of people traversing the space, were found to cause major fluctuations of the received BLE signal strength. These issues are rendering a seamless functioning of any location-based application practically impossible. Instead of achieving seamlessness by eliminating these technical issues, we thus choose to advocate the use of a seamful approach, that is, to reveal and exploit these problems and turn them into a part of the actual experience. In order to demonstrate the viability of this approach, we designed, implemented, and evaluated the Ghost Detector -an educational location-based museum game for children. 
By presenting a qualitative evaluation of this game and by motivating our design decisions, this article provides insight into some of the challenges and possible solutions connected to the process of developing location-based BLE-enabled experiences for public cultural spaces.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yan:2017:LCI, author = "Zheng Yan", title = "Learning from Collective Intelligence: Feature Learning Using Social Images and Tags", journal = j-TOMM, volume = "13", number = "1", pages = "1:1--1:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2978656", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Feature representation for visual content is the key to the progress of many fundamental applications such as annotation and cross-modal retrieval. Although recent advances in deep feature learning offer a promising route towards these tasks, they are limited in application domains where high-quality and large-scale training data are expensive to obtain. In this article, we propose a novel deep feature learning paradigm based on social collective intelligence, which can be acquired from the inexhaustible social multimedia content on the Web, in particular, largely social images and tags. Differing from existing feature learning approaches that rely on high-quality image-label supervision, our weak supervision is acquired by mining the visual-semantic embeddings from noisy, sparse, and diverse social image collections. 
The resultant image-word embedding space can be used to (1) fine-tune deep visual models for low-level feature extractions and (2) seek sparse representations as high-level cross-modal features for both image and text. We offer an easy-to-use implementation for the proposed paradigm, which is fast and compatible with any state-of-the-art deep architectures. Extensive experiments on several benchmarks demonstrate that the cross-modal features learned by our paradigm significantly outperforms others in various applications such as content-based retrieval, classification, and image captioning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheung:2017:PVT, author = "Ming Cheung and James She and Alvin Junus and Lei Cao", title = "Prediction of Virality Timing Using Cascades in Social Media", journal = j-TOMM, volume = "13", number = "1", pages = "2:1--2:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2978771", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Predicting content going viral in social networks is attractive for viral marketing, advertisement, entertainment, and other applications, but it remains a challenge in the big data era today. Previous works mainly focus on predicting the possible popularity of content rather than the timing of reaching such popularity. This work proposes a novel yet practical iterative algorithm to predict virality timing, in which the correlation between the timing and growth of content popularity is captured by using its own big data naturally generated from users' sharing. 
Such data is not only able to correlate the dynamics and associated timings in social cascades of viral content but also can be useful to self-correct the predicted timing against the actual timing of the virality in each iterative prediction. The proposed prediction algorithm is verified by datasets from two popular social networks-Twitter and Digg-as well as two synthesized datasets with extreme network densities and infection rates. With about 50\% of the required content virality data available (i.e., halfway before reaching its actual virality timing), the error of the predicted timing is proven to be bounded within a 40\% deviation from the actual timing. To the best of our knowledge, this is the first work that predicts content virality timing iteratively by capturing social cascades dynamics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chiu:2017:AAS, author = "Chih-Yi Chiu and Yu-Cyuan Liou and Amorntip Prayoonwong", title = "Approximate Asymmetric Search for Binary Embedding Codes", journal = j-TOMM, volume = "13", number = "1", pages = "3:1--3:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2990504", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose a method of approximate asymmetric nearest-neighbor search for binary embedding codes. The asymmetric distance takes advantage of less information loss at the query side. However, calculating asymmetric distances through exhaustive search is prohibitive in a large-scale dataset. 
We present a novel method, called multi-index voting, that integrates the multi-index hashing technique with a voting mechanism to select appropriate candidates and calculate their asymmetric distances. We show that the candidate selection scheme can be formulated as the tail of the binomial distribution function. In addition, a binary feature selection method based on minimal quantization error is proposed to address the memory insufficiency issue and improve the search accuracy. Substantial experimental evaluations were made to demonstrate that the proposed method can yield an approximate accuracy to the exhaustive search method while significantly accelerating the runtime. For example, one result shows that in a dataset of one billion 256-bit binary codes, examining only 0.5\% of the dataset, can reach 95--99\% close accuracy to the exhaustive search method and accelerate the search by 73--128 times. It also demonstrates an excellent tradeoff between the search accuracy and time efficiency compared to the state-of-the-art nearest-neighbor search methods. Moreover, the proposed feature selection method shows its effectiveness and improves the accuracy up to 8.35\% compared with other feature selection methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Miller:2017:QBL, author = "Konstantin Miller and Abdel-Karim Al-Tamimi and Adam Wolisz", title = "{QoE}-Based Low-Delay Live Streaming Using Throughput Predictions", journal = j-TOMM, volume = "13", number = "1", pages = "4:1--4:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2990505", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, Hypertext Transfer Protocol (HTTP)-based adaptive streaming has become the de facto standard for video streaming over the Internet. It allows clients to dynamically adapt media characteristics to the varying network conditions to ensure a high quality of experience (QoE)-that is, minimize playback interruptions while maximizing video quality at a reasonable level of quality changes. In the case of live streaming, this task becomes particularly challenging due to the latency constraints. The challenge further increases if a client uses a wireless access network, where the throughput is subject to considerable fluctuations. Consequently, live streams often exhibit latencies of up to 20 to 30 seconds. In the present work, we introduce an adaptation algorithm for HTTP-based live streaming called LOLYPOP (short for low-latency prediction-based adaptation), which is designed to operate with a transport latency of a few seconds. To reach this goal, LOLYPOP leverages Transmission Control Protocol throughput predictions on multiple time scales, from 1 to 10 seconds, along with estimations of the relative prediction error distributions. 
In addition to satisfying the latency constraint, the algorithm heuristically maximizes the QoE by maximizing the average video quality as a function of the number of skipped segments and quality transitions. To select an efficient prediction method, we studied the performance of several time series prediction methods in IEEE 802.11 wireless access networks. We evaluated LOLYPOP under a large set of experimental conditions, limiting the transport latency to 3 seconds, against a state-of-the-art adaptation algorithm called FESTIVE. We observed that the average selected video representation index is by up to a factor of 3 higher than with the baseline approach. We also observed that LOLYPOP is able to reach points from a broader region in the QoE space, and thus it is better adjustable to the user profile or service provider requirements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ranasinghe:2017:DLS, author = "Nimesha Ranasinghe and Ellen Yi-Luen Do", title = "Digital Lollipop: Studying Electrical Stimulation on the Human Tongue to Simulate Taste Sensations", journal = j-TOMM, volume = "13", number = "1", pages = "5:1--5:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2996462", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Among the five primary senses, the sense of taste is the least explored as a form of digital media applied in Human--Computer Interface. 
This article presents an experimental instrument, the Digital Lollipop, for digitally simulating the sensation of taste (gustation) by utilizing electrical stimulation on the human tongue. The system is capable of manipulating the properties of electric currents (magnitude, frequency, and polarity) to formulate different stimuli. To evaluate the effectiveness of this method, the system was experimentally tested in two studies. The first experiment was conducted using separate regions of the human tongue to record occurrences of basic taste sensations and their respective intensity levels. The results indicate occurrences of sour, salty, bitter, and sweet sensations from different regions of the tongue. One of the major discoveries of this experiment was that the sweet taste emerges via an inverse-current mechanism, which deserves further research in the future. The second study was conducted to compare natural and artificial (virtual) sour taste sensations and examine the possibility of effectively controlling the artificial sour taste at three intensity levels (mild, medium, and strong). The proposed method is attractive since it does not require any chemical solutions and facilitates further research opportunities in several directions including human--computer interaction, virtual reality, food and beverage, as well as medicine.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Min:2017:FPT, author = "Xiongkuo Min and Guangtao Zhai and Ke Gu and Xiaokang Yang", title = "Fixation Prediction through Multimodal Analysis", journal = j-TOMM, volume = "13", number = "1", pages = "6:1--6:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2996463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose to predict human eye fixation through incorporating both audio and visual cues. Traditional visual attention models generally make the utmost of stimuli's visual features, yet they bypass all audio information. In the real world, however, we not only direct our gaze according to visual saliency, but also are attracted by salient audio cues. Psychological experiments show that audio has an influence on visual attention, and subjects tend to be attracted by the sound sources. Therefore, we propose fusing both audio and visual information to predict eye fixation. In our proposed framework, we first localize the moving--sound-generating objects through multimodal analysis and generate an audio attention map. Then, we calculate the spatial and temporal attention maps using the visual modality. Finally, the audio, spatial, and temporal attention maps are fused to generate the final audiovisual saliency map. The proposed method is applicable to scenes containing moving--sound-generating objects. We gather a set of video sequences and collect eye-tracking data under an audiovisual test condition. 
Experiment results show that we can achieve better eye fixation prediction performance when taking both audio and visual cues into consideration, especially in some typical scenes in which object motion and audio are highly correlated.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chu:2017:POI, author = "Wei-Ta Chu and Chih-Hao Chiu", title = "Predicting Occupation from Images by Combining Face and Body Context Information", journal = j-TOMM, volume = "13", number = "1", pages = "7:1--7:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3009911", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Facial images embed age, gender, and other rich information that is implicitly related to occupation. In this work, we advocate that occupation prediction from a single facial image is a doable computer vision problem. We extract multilevel hand-crafted features associated with locality-constrained linear coding and convolutional neural network features as image occupation descriptors. To avoid the curse of dimensionality and overfitting, a boost strategy called multichannel SVM is used to integrate features from face and body. Intra- and interclass visual variations are jointly considered in the boosting framework to further improve performance. In the evaluation, we verify the effectiveness of predicting occupation from face and demonstrate promising performance obtained by combining face and body information. 
More importantly, our work further integrates deep features into the multichannel SVM framework and shows significantly better performance over the state of the art.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Xu:2017:CSA, author = "Jingxi Xu and Benjamin W. Wah", title = "Consistent Synchronization of Action Order with Least Noticeable Delays in Fast-Paced Multiplayer Online Games", journal = j-TOMM, volume = "13", number = "1", pages = "8:1--8:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3003727", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "When running multiplayer online games on IP networks with losses and delays, the order of actions may be changed when compared to the order run on an ideal network with no delays and losses. To maintain a proper ordering of events, traditional approaches either use rollbacks to undo certain actions or local lags to introduce additional delays. Both may be perceived by players because their changes are beyond the just-noticeable-difference (JND) threshold. In this article, we propose a novel method for ensuring a strongly consistent completion order of actions, where strong consistency refers to the same completion order as well as the same interval between any completion time and the corresponding ideal reference completion time under no network delay. We find that small adjustments within the JND on the duration of an action would not be perceivable, as long as the duration is comparable to the network round-trip time. 
We utilize this property to control the vector of durations of actions and formulate the search of the vector as a multidimensional optimization problem. By using the property that players are generally more sensitive to the most prominent delay effect (with the highest probability of noticeability P$_{notice}$ or the probability of correctly noticing a change when compared to the reference), we prove that the optimal solution occurs when P$_{notice}$ of the individual adjustments are equal. As this search can be done efficiently in polynomial time ($\approx$ 5ms) with a small amount of space ($\approx$ 160KB), the search can be done at runtime to determine the optimal control. Last, we evaluate our approach on the popular open-source online shooting game BZFlag.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Schramm:2017:ATS, author = "Rodrigo Schramm and Helena {De Souza Nunes} and Cl{\'a}udio Rosito Jung", title = "Audiovisual Tool for {Solf{\`e}ge} Assessment", journal = j-TOMM, volume = "13", number = "1", pages = "9:1--9:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3007194", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Solf{\`e}ge is a general technique used in the music learning process that involves the vocal performance of melodies, regarding the time and duration of musical sounds as specified in the music score, properly associated with the meter-mimicking performed by hand movement. This article presents an audiovisual approach for automatic assessment of this relevant musical study practice. 
The proposed system combines the gesture of meter-mimicking (video information) with the melodic transcription (audio information), where hand movement works as a metronome, controlling the time flow (tempo) of the musical piece. Thus, meter-mimicking is used to align the music score (ground truth) with the sung melody, allowing assessment even in time-dynamic scenarios. Audio analysis is applied to achieve the melodic transcription of the sung notes and the solf{\`e}ge performances are evaluated by a set of Bayesian classifiers that were generated from real evaluations done by expert listeners.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2017:IRS, author = "Haojun Wu and Yong Wang and Jiwu Huang", title = "Identification of Reconstructed Speech", journal = j-TOMM, volume = "13", number = "1", pages = "10:1--10:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3004055", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Both voice conversion and hidden Markov model-- (HMM) based speech synthesis can be used to produce artificial voices of a target speaker. They have shown great negative impacts on speaker verification (SV) systems. In order to enhance the security of SV systems, the techniques to detect converted/synthesized speech should be taken into consideration. During voice conversion and HMM-based synthesis, speech reconstruction is applied to transform a set of acoustic parameters to reconstructed speech.
Hence, the identification of reconstructed speech can be used to distinguish converted/synthesized speech from human speech. Several related works on such identification have been reported. The equal error rates (EERs) lower than 5\% of detecting reconstructed speech have been achieved. However, through the cross-database evaluations on different speech databases, we find that the EERs of several testing cases are higher than 10\%. The robustness of detection algorithms to different speech databases needs to be improved. In this article, we propose an algorithm to identify the reconstructed speech. Three different speech databases and two different reconstruction methods are considered in our work, which has not been addressed in the reported works. The high-dimensional data visualization approach is used to analyze the effect of speech reconstruction on Mel-frequency cepstral coefficients (MFCC) of speech signals. The Gaussian mixture model supervectors of MFCC are used as acoustic features. Furthermore, a set of commonly used classification algorithms are applied to identify reconstructed speech. According to the comparison among different classification methods, linear discriminant analysis-ensemble classifiers are chosen in our algorithm. Extensive experimental results show that the EERs lower than 1\% can be achieved by the proposed algorithm in most cases, outperforming the reported state-of-the-art identification techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gaj:2017:DCR, author = "Sibaji Gaj and Aditya Kanetkar and Arijit Sur and Prabin Kumar Bora", title = "Drift-Compensated Robust Watermarking Algorithm for {H.265\slash HEVC} Video Stream", journal = j-TOMM, volume = "13", number = "1", pages = "11:1--11:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3009910", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "It has been observed in the recent literature that the drift error due to watermarking degrades the visual quality of the embedded video. The existing drift error handling strategies for recent video standards such as H.264 may not be directly applicable for upcoming high-definition video standards (such as High Efficiency Video Coding (HEVC)) due to different compression architecture. In this article, a compressed domain watermarking scheme is proposed for H.265/HEVC bit stream that can handle drift error propagation both for intra- and interprediction process. Additionally, the proposed scheme shows adequate robustness against recompression attack as well as common image processing attacks while maintaining decent visual quality. A comprehensive set of experiments has been carried out to justify the efficacy of the proposed scheme over the existing literature.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dutta:2017:EFC, author = "Tanima Dutta and Hari Prabhat Gupta", title = "An Efficient Framework for Compressed Domain Watermarking in {$P$} Frames of High-Efficiency Video Coding ({HEVC})-Encoded Video", journal = j-TOMM, volume = "13", number = "1", pages = "12:1--12:??", month = jan, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3002178", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Jan 18 17:18:28 MST 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Digital watermarking has received much attention in recent years as a promising solution to copyright protection. Video watermarking in compressed domain has gained importance since videos are stored and transmitted in a compressed format. This decreases the overhead to fully decode and re-encode the video for embedding and extraction of the watermark. High Efficiency Video Coding (HEVC/H.265) is the latest and most efficient video compression standard and a successor to H.264 Advanced Video Coding. In this article, we propose a robust watermarking framework for HEVC-encoded video using informed detector. A readable watermark is embedded invisibly in P frames for better perceptual quality. Our framework imposes security and robustness by selecting appropriate blocks using a random key and the spatio-temporal characteristics of the compressed video. A detail analysis of the strengths of different compressed domain features is performed for implementing the watermarking framework. We experimentally demonstrate the utility of the proposed work. 
The results show that the proposed work effectively limits the increase in video bitrate and degradation in perceptual quality. The proposed framework is robust against re-encoding and image processing attacks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lisanti:2017:MKC, author = "Giuseppe Lisanti and Svebor Karaman and Iacopo Masi", title = "Multichannel-Kernel Canonical Correlation Analysis for Cross-View Person Reidentification", journal = j-TOMM, volume = "13", number = "2", pages = "13:1--13:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3038916", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we introduce a method to overcome one of the main challenges of person reidentification in multicamera networks, namely cross-view appearance changes. The proposed solution addresses the extreme variability of person appearance in different camera views by exploiting multiple feature representations. For each feature, kernel canonical correlation analysis with different kernels is employed to learn several projection spaces in which the appearance correlation between samples of the same person observed from different cameras is maximized. An iterative logistic regression is finally used to select and weight the contributions of each projection and perform the matching between the two views. 
Experimental evaluation shows that the proposed solution obtains comparable performance on the VIPeR and PRID 450s datasets and improves on the PRID and CUHK01 datasets with respect to the state of the art.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ye:2017:TOM, author = "Jun Ye and Hao Hu and Guo-Jun Qi and Kien A. Hua", title = "A Temporal Order Modeling Approach to Human Action Recognition from Multimodal Sensor Data", journal = j-TOMM, volume = "13", number = "2", pages = "14:1--14:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3038917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "From wearable devices to depth cameras, researchers have exploited various multimodal data to recognize human actions for applications, such as video gaming, education, and healthcare. Although many successful techniques have been presented in the literature, most current approaches have focused on statistical or local spatiotemporal features and do not explicitly explore the temporal dynamics of the sensor data. However, human action data contain rich temporal structure information that can characterize the unique underlying patterns of different action categories. From this perspective, we propose a novel temporal order modeling approach to human action recognition. Specifically, we explore subspace projections to extract the latent temporal patterns from different human action sequences. The temporal order between these patterns are compared, and the index of the pattern that appears first is used to encode the entire sequence.
This process is repeated multiple times and produces a compact feature vector representing the temporal dynamics of the sequence. Human action recognition can then be efficiently solved by the nearest neighbor search based on the Hamming distance between these compact feature vectors. We further introduce a sequential optimization algorithm to learn the optimized projections that preserve the pairwise label similarity of the action sequences. Experimental results on two public human action datasets demonstrate the superior performance of the proposed technique in both accuracy and efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2017:MCL, author = "Shuai Wang and Yang Cong and Huijie Fan and Baojie Fan and Lianqing Liu and Yunsheng Yang and Yandong Tang and Huaici Zhao and Haibin Yu", title = "Multi-Class Latent Concept Pooling for Computer-Aided Endoscopy Diagnosis", journal = j-TOMM, volume = "13", number = "2", pages = "15:1--15:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3051481", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Successful computer-aided diagnosis systems typically rely on training datasets containing sufficient and richly annotated images. However, detailed image annotation is often time consuming and subjective, especially for medical images, which becomes the bottleneck for the collection of large datasets and then building computer-aided diagnosis systems. 
In this article, we design a novel computer-aided endoscopy diagnosis system to deal with the multi-classification problem of electronic endoscopy medical records (EEMRs) containing sets of frames, while labels of EEMRs can be mined from the corresponding text records using an automatic text-matching strategy without human special labeling. With unambiguous EEMR labels and ambiguous frame labels, we propose a simple but effective pooling scheme called Multi-class Latent Concept Pooling, which learns a codebook from EEMRs with different classes step by step and encodes EEMRs based on a soft weighting strategy. In our method, a computer-aided diagnosis system can be extended to new unseen classes with ease and applied to the standard single-instance classification problem even though detailed annotated images are unavailable. In order to validate our system, we collect 1,889 EEMRs with more than 59K frames and successfully mine labels for 348 of them. The experimental results show that our proposed system significantly outperforms the state-of-the-art methods. Moreover, we apply the learned latent concept codebook to detect the abnormalities in endoscopy images and compare it with a supervised learning classifier, and the evaluation shows that our codebook learning method can effectively extract the true prototypes related to different classes from the ambiguous data.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Demirbilek:2017:MLB, author = "Edip Demirbilek and Jean-Charles Gr{\'e}goire", title = "Machine Learning-Based Parametric Audiovisual Quality Prediction Models for Real-Time Communications", journal = j-TOMM, volume = "13", number = "2", pages = "16:1--16:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3051482", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In order to mechanically predict audiovisual quality in interactive multimedia services, we have developed machine learning--based no-reference parametric models. We have compared Decision Trees--based ensemble methods, Genetic Programming and Deep Learning models that have one and more hidden layers. We have used the Institut national de la recherche scientifique (INRS) audiovisual quality dataset specifically designed to include ranges of parameters and degradations typically seen in real-time communications. Decision Trees--based ensemble methods have outperformed both Deep Learning-- and Genetic Programming--based models in terms of Root-Mean-Square Error (RMSE) and Pearson correlation values. We have also trained and developed models on various publicly available datasets and have compared our results with those of these original models. Our studies show that Random Forests--based prediction models achieve high accuracy for both the INRS audiovisual quality dataset and other publicly available comparable datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gokhale:2017:CCN, author = "Vineet Gokhale and Jayakrishnan Nair and Subhasis Chaudhuri", title = "Congestion Control for Network-Aware Telehaptic Communication", journal = j-TOMM, volume = "13", number = "2", pages = "17:1--17:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3052821", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Telehaptic applications involve delay-sensitive multimedia communication between remote locations with distinct Quality of Service (QoS) requirements for different media components. These QoS constraints pose a variety of challenges, especially when the communication occurs over a shared network, with unknown and time-varying cross-traffic. In this work, we propose a transport layer congestion control protocol for telehaptic applications operating over shared networks, termed as Dynamic Packetization Module (DPM). DPM is a lossless, network-aware protocol that tunes the telehaptic packetization rate based on the level of congestion in the network. To monitor the network congestion, we devise a novel network feedback module, which communicates the end-to-end delays encountered by the telehaptic packets to the respective transmitters with negligible overhead. Via extensive simulations, we show that DPM meets the QoS requirements of telehaptic applications over a wide range of network cross-traffic conditions. We also report qualitative results of a real-time telepottery experiment with several human subjects, which reveal that DPM preserves the quality of telehaptic activity even under heavily congested network scenarios. 
Finally, we compare the performance of DPM with several previously proposed telehaptic communication protocols and demonstrate that DPM outperforms these protocols.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sobhani:2017:VBA, author = "Ashkan Sobhani and Abdulsalam Yassine and Shervin Shirmohammadi", title = "A Video Bitrate Adaptation and Prediction Mechanism for {HTTP} Adaptive Streaming", journal = j-TOMM, volume = "13", number = "2", pages = "18:1--18:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3052822", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The Hypertext Transfer Protocol (HTTP) Adaptive Streaming (HAS) has now become ubiquitous and accounts for a large amount of video delivery over the Internet. But since the Internet is prone to bandwidth variations, HAS's up and down switching between different video bitrates to keep up with bandwidth variations leads to a reduction in Quality of Experience (QoE). In this article, we propose a video bitrate adaptation and prediction mechanism based on Fuzzy logic for HAS players, which takes into consideration the estimate of available network bandwidth as well as the predicted buffer occupancy level in order to proactively and intelligently respond to current conditions. This leads to two contributions: First, it allows HAS players to take appropriate actions, sooner than existing methods, to prevent playback interruptions caused by buffer underrun, reducing the ON-OFF traffic phenomena associated with current approaches and increasing the QoE. 
Second, it facilitates fair sharing of bandwidth among competing players at the bottleneck link. We present the implementation of our proposed mechanism and provide both empirical/QoE analysis and performance comparison with existing work. Our results show that, compared to existing systems, our system has (1) better fairness among multiple competing players by almost 50\% on average and as much as 80\% as indicated by Jain's fairness index and (2) better perceived quality of video by almost 8\% on average and as much as 17\%, according to the estimated Mean Opinion Score (eMOS) model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Grant:2017:CSU, author = "Jason M. Grant and Patrick J. Flynn", title = "Crowd Scene Understanding from Video: a Survey", journal = j-TOMM, volume = "13", number = "2", pages = "19:1--19:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3052930", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Crowd video analysis has applications in crowd management, public space design, and visual surveillance. Example tasks potentially aided by automated analysis include anomaly detection (such as a person walking against the grain of traffic or rapid assembly/dispersion of groups of people), population and density measurements, and interactions between groups of people. This survey explores crowd analysis as it relates to two primary research areas: crowd statistics and behavior understanding. First, we survey methods for counting individuals and approximating the density of the crowd.
Second, we showcase research efforts on behavior understanding as related to crowds. These works focus on identifying groups, interactions within small groups, and abnormal activity detection such as riots and bottlenecks in large crowds. Works presented in this section also focus on tracking groups of individuals, either as a single entity or a subset of individuals within the frame of reference. Finally, a summary of datasets available for crowd activity video research is provided.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hussein:2017:VJF, author = "Fairouz Hussein and Massimo Piccardi", title = "{V-JAUNE}: a Framework for Joint Action Recognition and Video Summarization", journal = j-TOMM, volume = "13", number = "2", pages = "20:1--20:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3063532", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video summarization and action recognition are two important areas of multimedia video analysis. While these two areas have been tackled separately to date, in this article, we present a latent structural SVM framework to recognize the action and derive the summary of a video in a joint, simultaneous fashion. Efficient inference is provided by a submodular score function that accounts for the action and summary jointly. In this article, we also define a novel measure to evaluate the quality of a predicted video summary against the annotations of multiple annotators. 
Quantitative and qualitative results over two challenging action datasets --- the ACE and MSR DailyActivity3D datasets --- show that the proposed joint approach leads to higher action recognition accuracy and equivalent or better summary quality than comparable approaches that perform these tasks separately.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cizmeci:2017:MSM, author = "Burak Cizmeci and Xiao Xu and Rahul Chaudhari and Christoph Bachhuber and Nicolas Alt and Eckehard Steinbach", title = "A Multiplexing Scheme for Multimodal Teleoperation", journal = j-TOMM, volume = "13", number = "2", pages = "21:1--21:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3063594", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article proposes an application-layer multiplexing scheme for teleoperation systems with multimodal feedback (video, audio, and haptics). The available transmission resources are carefully allocated to avoid delay-jitter for the haptic signal potentially caused by the size and arrival time of the video and audio data. The multiplexing scheme gives high priority to the haptic signal and applies a preemptive-resume scheduling strategy to stream the audio and video data. The proposed approach estimates the available transmission rate in real time and adapts the video bitrate, data throughput, and force buffer size accordingly. Furthermore, the proposed scheme detects sudden transmission rate drops and applies congestion control to avoid abrupt delay increases and converge promptly to the altered transmission rate. 
The performance of the proposed scheme is measured objectively in terms of end-to-end signal latencies, packet rates, and peak signal-to-noise ratio (PSNR) for visual quality. Moreover, peak-delay and convergence time measurements are carried out to investigate the performance of the congestion control mode of the system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Su:2017:DDP, author = "Zhuo Su and Kun Zeng and Hanhui Li and Xiaonan Luo", title = "A Dual-Domain Perceptual Framework for Generating Visual Inconspicuous Counterparts", journal = j-TOMM, volume = "13", number = "2", pages = "22:1--22:??", month = may, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3068427", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jun 16 14:48:38 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For a given image, it is a challenging task to generate its corresponding counterpart with visual inconspicuous modification. The complexity of this problem reasons from the high correlativity between the editing operations and vision perception. Essentially, a significant requirement that should be emphasized is how to make the object modifications hard to be found visually in the generative counterparts. In this article, we propose a novel dual-domain perceptual framework to generate visual inconspicuous counterparts, which applies the perceptual bidirectional similarity metric (PBSM) and appearance similarity metric (ASM) to create the dual-domain perception error minimization model. The candidate targets are yielded by the well-known PatchMatch model with the strokes-based interactions and selective object library. 
By the dual-perceptual evaluation index, all candidate targets are sorted to select out the best result. For demonstration, a series of objective and subjective measurements are used to evaluate the performance of our framework.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Singh:2017:SCB, author = "Priyanka Singh and Balasubramanian Raman and Nishant Agarwal and Pradeep K. Atrey", title = "Secure Cloud-Based Image Tampering Detection and Localization Using {POB} Number System", journal = j-TOMM, volume = "13", number = "3", pages = "23:1--23:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3077140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The benefits of high-end computation infrastructure facilities provided by cloud-based multimedia systems are attracting people all around the globe. However, such cloud-based systems possess security issues as third party servers become involved in them. Rendering data in an unreadable form so that no information is revealed to the cloud data centers will serve as the best solution to these security issues. One such image encryption scheme based on a Permutation Ordered Binary Number System has been proposed in this work. It distributes the image information in totally random shares, which can be stored at the cloud data centers. Further, the proposed scheme authenticates the shares at the pixel level. If any tampering is done at the cloud servers, the scheme can accurately identify the altered pixels via authentication bits and localizes the tampered area. 
The tampered portion is also reflected back in the reconstructed image that is obtained at the authentic user end. The experimental results validate the efficacy of the proposed scheme against various kinds of possible attacks, tested with a variety of images. The tamper detection accuracy has been computed on a pixel basis and found to be satisfactorily high for most of the tampering scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Thirunarayanan:2017:CSE, author = "Ishwarya Thirunarayanan and Khimya Khetarpal and Sanjeev Koppal and Olivier {Le Meur} and John Shea and Eakta Jain", title = "Creating Segments and Effects on Comics by Clustering Gaze Data", journal = j-TOMM, volume = "13", number = "3", pages = "24:1--24:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3078836", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Traditional comics are increasingly being augmented with digital effects, such as recoloring, stereoscopy, and animation. An open question in this endeavor is identifying where in a comic panel the effects should be placed. We propose a fast, semi-automatic technique to identify effects-worthy segments in a comic panel by utilizing gaze locations as a proxy for the importance of a region. We take advantage of the fact that comic artists influence viewer gaze towards narrative important regions. By capturing gaze locations from multiple viewers, we can identify important regions and direct a computer vision segmentation algorithm to extract these segments. The challenge is that these gaze data are noisy and difficult to process. 
Our key contribution is to leverage a theoretical breakthrough in the computer networks community towards robust and meaningful clustering of gaze locations into semantic regions, without needing the user to specify the number of clusters. We present a method based on the concept of relative eigen quality that takes a scanned comic image and a set of gaze points and produces an image segmentation. We demonstrate a variety of effects such as defocus, recoloring, stereoscopy, and animations. We also investigate the use of artificially generated gaze locations from saliency models in place of actual gaze locations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Houle:2017:QEC, author = "Michael E. Houle and Xiguo Ma and Vincent Oria and Jichao Sun", title = "Query Expansion for Content-Based Similarity Search Using Local and Global Features", journal = j-TOMM, volume = "13", number = "3", pages = "25:1--25:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3063595", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents an efficient and totally unsupervised content-based similarity search method for multimedia data objects represented by high-dimensional feature vectors. The assumption is that the similarity measure is applicable to feature vectors of arbitrary length. During the offline process, different sets of features are selected by a generalized version of the Laplacian Score in an unsupervised way for individual data objects in the database. Online retrieval is performed by ranking the query object in the feature spaces of candidate objects. 
Those candidates for which the query object is ranked highly are selected as the query results. The ranking scheme is incorporated into an automated query expansion framework to further improve the semantic quality of the search result. Extensive experiments were conducted on several datasets to show the capability of the proposed method in boosting effectiveness without losing efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Riegler:2017:ACA, author = "Michael Riegler and Konstantin Pogorelov and Sigrun Losada Eskeland and Peter Thelin Schmidt and Zeno Albisser and Dag Johansen and Carsten Griwodz and P{\aa}l Halvorsen and Thomas {De Lange}", title = "From Annotation to Computer-Aided Diagnosis: Detailed Evaluation of a Medical Multimedia System", journal = j-TOMM, volume = "13", number = "3", pages = "26:1--26:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3079765", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Holistic medical multimedia systems covering end-to-end functionality from data collection to aided diagnosis are highly needed, but rare. In many hospitals, the potential value of multimedia data collected through routine examinations is not recognized. Moreover, the availability of the data is limited, as the health care personnel may not have direct access to stored data. However, medical specialists interact with multimedia content daily through their everyday work and have an increasing interest in finding ways to use it to facilitate their work processes. 
In this article, we present a novel, holistic multimedia system aiming to tackle automatic analysis of video from gastrointestinal (GI) endoscopy. The proposed system comprises the whole pipeline, including data collection, processing, analysis, and visualization. It combines filters using machine learning, image recognition, and extraction of global and local image features. The novelty is primarily in this holistic approach and its real-time performance, where we automate a complete algorithmic GI screening process. We built the system in a modular way to make it easily extendable to analyze various abnormalities, and we made it efficient in order to run in real time. The conducted experimental evaluation proves that the detection and localization accuracy are comparable or even better than existing systems, but it is by far leading in terms of real-time performance and efficient resource consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2017:EPR, author = "Xun Yang and Meng Wang and Richang Hong and Qi Tian and Yong Rui", title = "Enhancing Person Re-identification in a Self-Trained Subspace", journal = j-TOMM, volume = "13", number = "3", pages = "27:1--27:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3089249", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Despite the promising progress made in recent years, person re-identification (re-ID) remains a challenging task due to the complex variations in human appearances from different camera views. 
For this challenging problem, a large variety of algorithms have been developed in the fully supervised setting, requiring access to a large amount of labeled training data. However, the main bottleneck for fully supervised re-ID is the limited availability of labeled training samples. To address this problem, we propose a self-trained subspace learning paradigm for person re-ID that effectively utilizes both labeled and unlabeled data to learn a discriminative subspace where person images across disjoint camera views can be easily matched. The proposed approach first constructs pseudo-pairwise relationships among unlabeled persons using the k-nearest neighbors algorithm. Then, with the pseudo-pairwise relationships, the unlabeled samples can be easily combined with the labeled samples to learn a discriminative projection by solving an eigenvalue problem. In addition, we refine the pseudo-pairwise relationships iteratively, which further improves learning performance. A multi-kernel embedding strategy is also incorporated into the proposed approach to cope with the non-linearity in a person's appearance and explore the complementation of multiple kernels. In this way, the performance of person re-ID can be greatly enhanced when training data are insufficient. Experimental results on six widely used datasets demonstrate the effectiveness of our approach, and its performance can be comparable to the reported results of most state-of-the-art fully supervised methods while using much fewer labeled data.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2017:RHA, author = "Shih-Yao Lin and Yen-Yu Lin and Chu-Song Chen and Yi-Ping Hung", title = "Recognizing Human Actions with Outlier Frames by Observation Filtering and Completion", journal = j-TOMM, volume = "13", number = "3", pages = "28:1--28:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3089250", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article addresses the problem of recognizing partially observed human actions. Videos of actions acquired in the real world often contain corrupt frames caused by various factors. These frames may appear irregularly, and make the actions only partially observed. They change the appearance of actions and degrade the performance of pretrained recognition systems. In this article, we propose an approach to address the corrupt-frame problem without knowing their locations and durations in advance. The proposed approach includes two key components: outlier filtering and observation completion. The former identifies and filters out unobserved frames, and the latter fills up the filtered parts by retrieving coherent alternatives from training data. Hidden Conditional Random Fields (HCRFs) are then used to recognize the filtered and completed actions. Our approach has been evaluated on three datasets, which contain both fully observed actions and partially observed actions with either real or synthetic corrupt frames. The experimental results show that our approach performs favorably against the other state-of-the-art methods, especially when corrupt frames are present.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Karafotias:2017:IER, author = "Georgios Karafotias and Akiko Teranishi and Georgios Korres and Friederike Eyssel and Scandar Copti and Mohamad Eid", title = "Intensifying Emotional Reactions via Tactile Gestures in Immersive Films", journal = j-TOMM, volume = "13", number = "3", pages = "29:1--29:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092840", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The film industry continuously strives to make visitors' movie experience more immersive and thus, more captivating. This is realized through larger screens, sophisticated speaker systems, and high quality 2D and 3D content. Moreover, a recent trend in the film industry is to incorporate multiple interaction modalities, such as 4D film, to simulate rain, wind, vibration, and heat, in order to intensify viewers' emotional reactions. In this context, humans' sense of touch possesses significant potential for intensifying emotional reactions for the film experience beyond audio-visual sensory modalities. This article presents a framework for authoring tactile cues (tactile gestures as used in this article) and enabling automatic rendering of said gestures to intensify emotional reactions in an immersive film experience. To validate the proposed framework, we conducted an experimental study where tactile gestures are designed and evaluated for the ability to intensify four emotional reactions: high valence-high arousal, high valence-low arousal, low valence-high arousal, and low valence-low arousal. 
Using a haptic jacket, participants felt tactile gestures that are synchronized with the audio-visual contents of a film. Results demonstrated that (1) any tactile feedback generated a positive user experience; (2) the tactile feedback intensifies emotional reactions when the audio-visual stimuli elicit clear emotional responses, except for low arousal emotional response since tactile gestures seem to always generate excitement; (3) purposed tactile gestures do not seem to significantly outperform randomized tactile gesture for intensifying specific emotional reactions; and (4) using a haptic jacket is not distracting for the users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheung:2017:ASU, author = "Ming Cheung and James She", title = "An Analytic System for User Gender Identification through User Shared Images", journal = j-TOMM, volume = "13", number = "3", pages = "30:1--30:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3095077", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Many social media applications, such as recommendation, virality prediction, and marketing, make use of user gender, which may not be explicitly specified or kept privately. Meanwhile, advanced mobile devices have become part of our lives and a huge amount of content is being generated by users every day, especially user shared images shared by individuals in social networks. This particular form of user generated content is widely accessible to others due to the sharing nature. 
When user gender is only accessible to exclusive parties, these user shared images are proved to be an easier way to identify user gender. This work investigated 3,152,344 images by 7,450 users from Fotolog and Flickr, two image-oriented social networks. It is observed that users who share visually similar images are more likely to have the same gender. A multimedia big data system that utilizes this phenomenon is proposed for user gender identification with 79\% accuracy. These findings are useful for information or services in any social network with intensive image sharing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Engelbrecht:2017:PDS, author = "Herman A. Engelbrecht and John S. Gilmore", title = "{Pithos}: Distributed Storage for Massive Multi-User Virtual Environments", journal = j-TOMM, volume = "13", number = "3", pages = "31:1--31:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3105577", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "There has been significant research effort into peer-to-peer (P2P) massively multi-user virtual environments (MMVEs). A number of architectures have been proposed to implement the P2P approach; however, the development of fully distributed MMVEs has met with a number of challenges. In this work, we address one of the key remaining challenges of state consistency and persistency in P2P MMVEs. Having reviewed state management and persistency architectures currently receiving research attention, we have identified deficiencies such as lack of load balancing, responsiveness, and scalability. 
To address these deficiencies, we present Pithos --- a reliable, responsive, secure, load-balanced, and scalable distributed storage system, suited to P2P MMVEs. Pithos is designed specifically for P2P MMVEs, and we show that it improves the reliability and responsiveness of storage architectures as compared to existing P2P state persistency architectures. Pithos is implemented as an OverSim simulation running on the OMNeT++ network simulation framework. It is evaluated using up to 10,400 peers, with realistic latency profiles, with up to 15.8 million storage and retrieval requests that are generated to store a total of 2.4 million objects. Each peer in Pithos uses a maximum of 1,950Bps bandwidth to achieve 99.98\% storage reliability, while the most reliable overlay storage configuration tested only achieved 93.65\% reliability, using 2,182Bps bandwidth. Pithos is also more responsive than overlay storage, with an average responsiveness of 0.192s, compared with the average overlay responsiveness of 1.4s when retrieving objects from storage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2017:SDL, author = "Jun Zhang and Meng Wang and Liang Lin and Xun Yang and Jun Gao and Yong Rui", title = "Saliency Detection on Light Field: a Multi-Cue Approach", journal = j-TOMM, volume = "13", number = "3", pages = "32:1--32:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3107956", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Saliency detection has recently received increasing research interest on using high-dimensional datasets beyond two-dimensional images. 
Despite the many available capturing devices and algorithms, there still exists a wide spectrum of challenges that need to be addressed to achieve accurate saliency detection. Inspired by the success of the light-field technique, in this article, we propose a new computational scheme to detect salient regions by integrating multiple visual cues from light-field images. First, saliency prior maps are generated from several light-field features based on superpixel-level intra-cue distinctiveness, such as color, depth, and flow inherited from different focal planes and multiple viewpoints. Then, we introduce the location prior to enhance the saliency maps. These maps will finally be merged into a single map using a random-search-based weighting strategy. Besides, we refine the object details by employing a two-stage saliency refinement to obtain the final saliency map. In addition, we present a more challenging benchmark dataset for light-field saliency analysis, named HFUT-Lytro, which consists of 255 light fields with a range from 53 to 64 images generated from each light-field image, therein spanning multiple occurrences of saliency detection challenges such as occlusions, cluttered background, and appearance changes. Experimental results show that our approach can achieve 0.6--6.7\% relative improvements over state-of-the-art methods in terms of the F-measure and Precision metrics, which demonstrates the effectiveness of the proposed approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ota:2017:ISI, author = "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and Francesco G. B. 
{De Natale}", title = "Introduction to Special Issue on Deep Learning for Mobile Multimedia", journal = j-TOMM, volume = "13", number = "3s", pages = "33:1--33:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3088340", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ota:2017:DLM, author = "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and Francesco G. B. {De Natale}", title = "Deep Learning for Mobile Multimedia: a Survey", journal = j-TOMM, volume = "13", number = "3s", pages = "34:1--34:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092831", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep Learning (DL) has become a crucial technology for multimedia computing. It offers a powerful instrument to automatically produce high-level abstractions of complex multimedia data, which can be exploited in a number of applications, including object detection and recognition, speech-to-text, media retrieval, multimodal data analysis, and so on. The availability of affordable large-scale parallel processing architectures, and the sharing of effective open-source codes implementing the basic learning algorithms, caused a rapid diffusion of DL methodologies, bringing a number of new technologies and applications that outperform, in most cases, traditional machine learning technologies. 
In recent years, the possibility of implementing DL technologies on mobile devices has attracted significant attention. Thanks to this technology, portable devices may become smart objects capable of learning and acting. The path toward these exciting future scenarios, however, entangles a number of important research challenges. DL architectures and algorithms are hardly adapted to the storage and computation resources of a mobile device. Therefore, there is a need for new generations of mobile processors and chipsets, small footprint learning and inference algorithms, new models of collaborative and distributed processing, and a number of other fundamental building blocks. This survey reports the state of the art in this exciting research area, looking back to the evolution of neural networks, and arriving to the most recent results in terms of methodologies, technologies, and applications for mobile environments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Seidenari:2017:DAD, author = "Lorenzo Seidenari and Claudio Baecchi and Tiberio Uricchio and Andrea Ferracani and Marco Bertini and Alberto {Del Bimbo}", title = "Deep Artwork Detection and Retrieval for Automatic Context-Aware Audio Guides", journal = j-TOMM, volume = "13", number = "3s", pages = "35:1--35:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092832", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we address the problem of creating a smart audio guide that adapts to the actions and interests of museum visitors. 
As an autonomous agent, our guide perceives the context and is able to interact with users in an appropriate fashion. To do so, it understands what the visitor is looking at, if the visitor is moving inside the museum hall, or if he or she is talking with a friend. The guide performs automatic recognition of artworks, and it provides configurable interface features to improve the user experience and the fruition of multimedia materials through semi-automatic interaction. Our smart audio guide is backed by a computer vision system capable of working in real time on a mobile device, coupled with audio and motion sensors. We propose the use of a compact Convolutional Neural Network (CNN) that performs object classification and localization. Using the same CNN features computed for these tasks, we perform also robust artwork recognition. To improve the recognition accuracy, we perform additional video processing using shape-based filtering, artwork tracking, and temporal filtering. The system has been deployed on an NVIDIA Jetson TK1 and an NVIDIA Shield Tablet K1 and tested in a real-world environment (Bargello Museum of Florence).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pouladzadeh:2017:MMF, author = "Parisa Pouladzadeh and Shervin Shirmohammadi", title = "Mobile Multi-Food Recognition Using Deep Learning", journal = j-TOMM, volume = "13", number = "3s", pages = "36:1--36:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3063592", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose a mobile food recognition system that uses the picture of the food, taken by the user's mobile device, to recognize multiple food items in the same meal, such as steak and potatoes on the same plate, to estimate the calorie and nutrition of the meal. To speed up and make the process more accurate, the user is asked to quickly identify the general area of the food by drawing a bounding circle on the food picture by touching the screen. The system then uses image processing and computational intelligence for food item recognition. The advantage of recognizing items, instead of the whole meal, is that the system can be trained with only single item food images. At the training stage, we first use region proposal algorithms to generate candidate regions and extract the convolutional neural network (CNN) features of all regions. Second, we perform region mining to select positive regions for each food category using maximum cover by our proposed submodular optimization method. At the testing stage, we first generate a set of candidate regions. For each region, a classification score is computed based on its extracted CNN features and predicted food names of the selected regions. 
Since fast response is one of the important parameters for the user who wants to eat the meal, certain heavy computational parts of the application are offloaded to the cloud. Hence, the processes of food recognition and calorie estimation are performed in cloud server. Our experiments, conducted with the FooDD dataset, show an average recall rate of 90.98\%, precision rate of 93.05\%, and accuracy of 94.11\% compared to 50.8\% to 88\% accuracy of other existing food recognition systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bharati:2017:ETC, author = "Sailesh Bharati and Hassan Aboubakr Omar and Weihua Zhuang", title = "Enhancing Transmission Collision Detection for Distributed {TDMA} in Vehicular Networks", journal = j-TOMM, volume = "13", number = "3s", pages = "37:1--37:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092833", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The increasing number of road accidents has led to the evolution of vehicular ad hoc networks (VANETs), which allow vehicles and roadside infrastructure to continuously broadcast safety messages, including necessary information to avoid undesired events on the road. To support reliable broadcast of safety messages, distributed time division multiple access (D-TDMA) protocols are proposed for medium access control in VANETs. Existing D-TDMA protocols react to a transmission failure without distinguishing whether the failure comes from a transmission collision or from a poor radio channel condition, resulting in degraded performance. 
In this article, we present the importance of transmission failure differentiation due to a poor channel or due to a transmission collision for D-TDMA protocols in vehicular networks. We study the effects of such a transmission failure differentiation on the performance of a node when reserving a time slot to access the transmission channel. Furthermore, we propose a method for transmission failure differentiation, employing the concept of deep-learning techniques, for a node to decide whether to release or continue using its acquired time slot. The proposed method is based on the application of a Markov chain model to estimate the channel state when a transmission failure occurs. The Markov model parameters are dynamically updated by each node (i.e., vehicle or roadside unit) based on information included in the safety messages that are periodically received from neighboring nodes. In addition, from the D-TDMA protocol headers of received messages, a node approximately determines the error in estimating the channel state based on the proposed Markov model and then uses this channel estimation error to further improve subsequent channel state estimations. Through mathematical analysis, we show that transmission failure differentiation, or transmission collision detection, helps a node to efficiently reserve a time slot even with a large number of nodes contending for time slots. Furthermore, through extensive simulations in a highway scenario, we demonstrate that the proposed solution significantly improves the performance of D-TDMA protocols by reducing unnecessary contention on the available time slots, thus increasing the number of nodes having unique time slots for successful broadcast of safety messages.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Vandecasteele:2017:SSC, author = "Florian Vandecasteele and Karel Vandenbroucke and Dimitri Schuurman and Steven Verstockt", title = "{Spott}: On-the-Spot e-Commerce for Television Using Deep Learning-Based Video Analysis Techniques", journal = j-TOMM, volume = "13", number = "3s", pages = "38:1--38:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092834", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Spott is an innovative second screen mobile multimedia application which offers viewers relevant information on objects (e.g., clothing, furniture, food) they see and like on their television screens. The application enables interaction between TV audiences and brands, so producers and advertisers can offer potential consumers tailored promotions, e-shop items, and/or free samples. In line with the current views on innovation management, the technological excellence of the Spott application is coupled with iterative user involvement throughout the entire development process. This article discusses both of these aspects and how they impact each other. First, we focus on the technological building blocks that facilitate the (semi-) automatic interactive tagging process of objects in the video streams. The majority of these building blocks extensively make use of novel and state-of-the-art deep learning concepts and methodologies. We show how these deep learning based video analysis techniques facilitate video summarization, semantic keyframe clustering, and (similar) object retrieval. 
Secondly, we provide insights in user tests that have been performed to evaluate and optimize the application's user experience. The lessons learned from these open field tests have already been an essential input in the technology development and will further shape the future modifications to the Spott application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2017:TDC, author = "Qingchen Zhang and Laurence T. Yang and Xingang Liu and Zhikui Chen and Peng Li", title = "A {Tucker} Deep Computation Model for Mobile Multimedia Feature Learning", journal = j-TOMM, volume = "13", number = "3s", pages = "39:1--39:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3063593", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, the deep computation model, as a tensor deep learning model, has achieved super performance for multimedia feature learning. However, the conventional deep computation model involves a large number of parameters. Typically, training a deep computation model with millions of parameters needs high-performance servers with large-scale memory and powerful computing units, limiting the growth of the model size for multimedia feature learning on common devices such as portable CPUs and conventional desktops. To tackle this problem, this article proposes a Tucker deep computation model by using the Tucker decomposition to compress the weight tensors in the full-connected layers for multimedia feature learning. Furthermore, a learning algorithm based on the back-propagation strategy is devised to train the parameters of the Tucker deep computation model. 
Finally, the performance of the Tucker deep computation model is evaluated by comparing with the conventional deep computation model on two representative multimedia datasets, that is, CUAVE and SNAE2, in terms of accuracy drop, parameter reduction, and speedup in the experiments. Results imply that the Tucker deep computation model can achieve a large-parameter reduction and speedup with a small accuracy drop for multimedia feature learning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Timmerer:2017:BPA, author = "Christian Timmerer and Ali C. Begen", title = "Best Papers of the {2016 ACM Multimedia Systems (MMSys) Conference and Workshop on Network and Operating System Support for Digital Audio and Video (NOSSDAV) 2016}", journal = j-TOMM, volume = "13", number = "3s", pages = "40:1--40:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084539", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Daronco:2017:DRA, author = "Stefano D'Aronco and Sergio Mena and Pascal Frossard", title = "Distributed Rate Allocation in Switch-Based Multiparty Videoconferencing System", journal = j-TOMM, volume = "13", number = "3s", pages = "41:1--41:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092835", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multiparty videoconferences, or more generally multiparty video calls, are gaining a lot of popularity as they offer a rich communication experience. These applications have, however, large requirements in terms of both network and computational resources and have to deal with sets of heterogeneous clients. The multiparty videoconferencing systems are usually either based on expensive central nodes, called Multipoint Control Units (MCU), with transcoding capabilities, or on a peer-to-peer architecture where users cooperate to distribute more efficiently the different video streams. Whereas the first class of systems requires an expensive central hardware, the second one depends completely on the redistribution capacity of the users, which sometimes might neither provide sufficient bandwidth nor be reliable enough. In this work, we propose an alternative solution where we use a central node to distribute the video streams, but at the same time we maintain the hardware complexity and the computational requirements of this node as low as possible, for example, it has no video decoding capabilities. We formulate the rate allocation problem as an optimization problem that aims at maximizing the Quality of Service (QoS) of the videoconference.
We propose two different distributed algorithms for solving the optimization problem: the first algorithm is able to find an approximate solution of the problem in a one-shot execution, whereas the second algorithm, based on Lagrangian relaxation, performs iterative updates of the optimization variables in order to gradually increase the value of the objective function. The two algorithms, though being disjointed, nicely complement each other. If executed in sequence, they allow us to achieve both a quick approximate rate reallocation, in case of a sudden change of the system conditions, and a precise refinement of the variables, which avoids problems caused by possible faulty approximate solutions. We have further implemented our solution in a network simulator where we show that our rate allocation algorithm is able to properly optimize users' QoS. We also illustrate the benefits of our solution in terms of network usage and overall utility when compared to a baseline heuristic method operating on the same system architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cofano:2017:DPE, author = "Giuseppe Cofano and Luca {De Cicco} and Thomas Zinner and Anh Nguyen-Ngoc and Phuoc Tran-Gia and Saverio Mascolo", title = "Design and Performance Evaluation of Network-assisted Control Strategies for {HTTP} Adaptive Streaming", journal = j-TOMM, volume = "13", number = "3s", pages = "42:1--42:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092836", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article investigates several network-assisted streaming approaches that rely on active cooperation between video streaming applications and the network. We build a Video Control Plane that enforces Video Quality Fairness among concurrent video flows generated by heterogeneous client devices. For this purpose, a max-min fairness optimization problem is solved at runtime. We compare two approaches to actuate the optimal solution in a Software Defined Networking network: The first one allocates network bandwidth slices to video flows, and the second one guides video players in the video bitrate selection. We assess performance through several QoE-related metrics, such as Video Quality Fairness, video quality, and switching frequency. The impact of client-side adaptation algorithms is also investigated.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wisniewski:2017:OAA, author = "Piotr Wisniewski and Jordi Mongay Batalla and Andrzej Beben and Piotr Krawiec and Andrzej Chydzinski", title = "On Optimizing Adaptive Algorithms Based on Rebuffering Probability", journal = j-TOMM, volume = "13", number = "3s", pages = "43:1--43:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092837", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Traditionally, video adaptive algorithms aim to select the representation that better fits to the current download rate. In recent years, a number of new approaches appeared that take into account the buffer occupancy and the probability of video rebuffering as important indicators of the representation to be selected. We propose an optimization of the existing algorithm based on rebuffering probability and argue that the algorithm should avoid the situations when the client buffer is full and the download is stopped, since these situations decrease the efficiency of the algorithm. Reducing full buffer states does not increase the rebuffering probability thanks to a clever management of the client buffer, which analyses the buffer occupancy and downloads higher bitrate representations only in the case of high buffer occupancy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kleinrouweler:2017:SAP, author = "Jan Willem Kleinrouweler and Sergio Cabrero and Pablo Cesar", title = "An {SDN} Architecture for Privacy-Friendly Network-Assisted {DASH}", journal = j-TOMM, volume = "13", number = "3s", pages = "44:1--44:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092838", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is the premier technology for Internet video streaming. DASH efficiently uses existing HTTP-based delivery infrastructures implementing adaptive streaming. However, DASH traffic is bursty in nature. This causes performance problems when DASH players share a network connection or in networks with heavy background traffic. The result is unstable and lower quality video. In this article, we present the design and implementation of a so-called DASH Assisting Network Element (DANE). Our system provides target bitrate signaling and dynamic traffic control. These two mechanisms realize proper bandwidth sharing among clients. Our system is privacy friendly and fully supports encrypted video streams. Trying to improve the streaming experience for users who share a network connection, our system increases the video bitrate and reduces the number of quality switches. We show this through evaluations in our Wi-Fi testbed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2017:DAQ, author = "Cong Wang and Divyashri Bhat and Amr Rizk and Michael Zink", title = "Design and Analysis of {QoE}-Aware Quality Adaptation for {DASH}: a Spectrum-Based Approach", journal = j-TOMM, volume = "13", number = "3s", pages = "45:1--45:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092839", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The dynamics of the application-layer-based control loop of dynamic adaptive streaming over HTTP (DASH) make video bitrate selection for DASH a difficult problem. In this work, we provide a DASH quality adaptation algorithm, named SQUAD, that is specifically tailored to provide a high quality of experience (QoE). We review and provide new insights into the challenges for DASH rate estimation. We found that in addition to the ON-OFF behavior of DASH clients, there exists a discrepancy in the timescales that form the basis of the rate estimates across (i) different video segments and (ii) the rate control loops of DASH and Transmission Control Protocol (TCP). With these observations in mind, we design SQUAD aiming to maximize the average quality bitrate while minimizing the quality variations. We test our implementation of SQUAD together with a number of different quality adaptation algorithms under various conditions in the Global Environment for Networking Innovation testbed, as well as, in a series of measurements over the public Internet. Through a measurement study, we show that by sacrificing little to nothing in average quality bitrate, SQUAD can provide significantly better QoE in terms of quality switching and magnitude. 
In addition, we show that retransmission of higher-quality segments that were originally received in low-quality is feasible and improves the QoE.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2017:CAC, author = "Cong Zhang and Jiangchuan Liu and Haiyang Wang", title = "Cloud-Assisted Crowdsourced Livecast", journal = j-TOMM, volume = "13", number = "3s", pages = "46:1--46:??", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3095755", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:22 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The past two years have witnessed an explosion of a new generation of livecast services, represented by Twitch.tv, GamingLive, and Dailymotion, to name but a few. With such a livecast service, geo-distributed Internet users can broadcast any event in real-time, for example, game, cooking, drawing, and so on, to viewers of interest. Its crowdsourced nature enables rich interactions among broadcasters and viewers but also introduces great challenges to accommodate their great scales and dynamics. To fulfill the demands from a large number of heterogeneous broadcasters and geo-distributed viewers, expensive server clusters have been deployed to ingest and transcode live streams. Yet our Twitch-based measurement shows that a significant portion of the unpopular and dynamic broadcasters are consuming considerable system resources; in particular, 25\% of bandwidth resources and 30\% of computational capacity are used by the broadcasters who do not have any viewers at all. 
In this article, through the real-world measurement and data analysis, we show that the public cloud has great potentials to address these scalability challenges. We accordingly present the design of Cloud-assisted Crowdsourced Livecast (CACL) and propose a comprehensive set of solutions for broadcaster partitioning. Our trace-driven evaluations show that our CACL design can smartly assign ingesting and transcoding tasks to the elastic cloud virtual machines, providing flexible and cost-effective system deployment.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dao:2017:TCM, author = "Minh Son Dao", title = "This is the Table of Contents for the most recent online-only supplemental issue {TOMM} 13(3s). {Please} find this supplemental issue in the {ACM Digital Library} and enjoy reading them!", journal = j-TOMM, volume = "13", number = "4", pages = "47e:1--47e:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3143786", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "47e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2017:SRB, author = "Hong-Bo Zhang and Bineng Zhong and Qing Lei and Ji-Xiang Du and Jialin Peng and Duansheng Chen and Xiao Ke", title = "Sparse Representation-Based Semi-Supervised Regression for People Counting", journal = j-TOMM, volume = "13", number = "4", pages = "47:1--47:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106156", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Label imbalance and the insufficiency of labeled training samples are major obstacles in most methods for counting people in images or videos. In this work, a sparse representation-based semi-supervised regression method is proposed to count people in images with limited data. The basic idea is to predict the unlabeled training data, select reliable samples to expand the labeled training set, and retrain the regression model. In the algorithm, the initial regression model, which is learned from the labeled training data, is used to predict the number of people in the unlabeled training dataset. Then, the unlabeled training samples are regarded as an over-complete dictionary. Each feature of the labeled training data can be expressed as a sparse linear approximation of the unlabeled data. In turn, the labels of the labeled training data can be estimated based on a sparse reconstruction in feature space. The label confidence in labeling an unlabeled sample is estimated by calculating the reconstruction error. The training set is updated by selecting unlabeled samples with minimal reconstruction errors, and the regression model is retrained on the new training set. 
A co-training style method is applied during the training process. The experimental results demonstrate that the proposed method has a low mean square error and mean absolute error compared with those of state-of-the-art people-counting benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Akhtar:2017:COV, author = "Shahid Akhtar and Andre Beck and Ivica Rimac", title = "Caching Online Video: Analysis and Proposed Algorithm", journal = j-TOMM, volume = "13", number = "4", pages = "48:1--48:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106157", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online video presents new challenges to traditional caching with over a thousand-fold increase in number of assets, rapidly changing popularity of assets and much higher throughput requirements. We propose a new hierarchical filtering algorithm for caching online video HiFi. Our algorithm is designed to optimize hit rate, replacement rate and cache throughput. It has an associated implementation complexity comparable to that of LRU. Our results show that, under typical operator conditions, HiFi can increase edge cache byte hit rate by 5\%--24\% over an LRU policy, but more importantly can increase the RAM or memory byte hit rate by 80\% to 200\% and reduce the replacement rate by more than 100 times! These two factors combined can dramatically increase throughput for most caches. If SSDs are used for storage, the much lower replacement rate may also allow substitution of lower-cost MLC-based SSDs instead of SLC-based SSDs. 
We extend previous multi-tier analytical models for LRU caches to caches with filtering. We analytically show how HiFi can approach the performance of an optimal caching policy and how to tune HiFi to reach as close to optimal performance as the traffic conditions allow. We develop a realistic simulation environment for online video using statistics from operator traces. We show that HiFi performs within a few percentage points from the optimal solution which was simulated by Belady's MIN algorithm under typical operator conditions", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dang-Nguyen:2017:MRD, author = "Duc-Tien Dang-Nguyen and Luca Piras and Giorgio Giacinto and Giulia Boato and Francesco G. B. {De Natale}", title = "Multimodal Retrieval with Diversification and Relevance Feedback for Tourist Attraction Images", journal = j-TOMM, volume = "13", number = "4", pages = "49:1--49:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3103613", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we present a novel framework that can produce a visual description of a tourist attraction by choosing the most diverse pictures from community-contributed datasets, which describe different details of the queried location. 
The main strength of the proposed approach is its flexibility that permits us to filter out non-relevant images and to obtain a reliable set of diverse and relevant images by first clustering similar images according to their textual descriptions and their visual content and then extracting images from different clusters according to a measure of the user's credibility. Clustering is based on a two-step process, where textual descriptions are used first and the clusters are then refined according to the visual features. The degree of diversification can be further increased by exploiting users' judgments on the results produced by the proposed algorithm through a novel approach, where users not only provide a relevance feedback but also a diversity feedback. Experimental results performed on the MediaEval 2015 ``Retrieving Diverse Social Images'' dataset show that the proposed framework can achieve very good performance both in the case of automatic retrieval of diverse images and in the case of the exploitation of the users' feedback. The effectiveness of the proposed approach has been also confirmed by a small case study involving a number of real users.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{FujiiPontello:2017:MUR, author = "Luciana {Fujii Pontello} and Pedro H. F. Holanda and Bruno Guilherme and Jo{\~a}o Paulo V. 
Cardoso and Olga Goussevskaia and Ana Paula {Couto Da Silva}", title = "Mixtape: Using Real-Time User Feedback to Navigate Large Media Collections", journal = j-TOMM, volume = "13", number = "4", pages = "50:1--50:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3105969", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this work, we explore the increasing demand for novel user interfaces to navigate large media collections. We implement a geometric data structure to store and retrieve item-to-item similarity information and propose a novel navigation framework that uses vector operations and real-time user feedback to direct the outcome. The framework is scalable to large media collections and is suitable for computationally constrained devices. In particular, we implement this framework in the domain of music. To evaluate the effectiveness of the navigation process, we propose an automatic evaluation framework, based on synthetic user profiles, which allows us to quickly simulate and compare navigation paths using different algorithms and datasets. Moreover, we perform a real user study. To do that, we developed and launched Mixtape, a simple web application that allows users to create playlists by providing real-time feedback through liking and skipping patterns.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yakubu:2017:SSN, author = "Abukari M. Yakubu and Namunu C. Maddage and Pradeep K. 
Atrey", title = "Securing Speech Noise Reduction in Outsourced Environment", journal = j-TOMM, volume = "13", number = "4", pages = "51:1--51:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3105970", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud data centers (CDCs) are becoming a cost-effective method for processing and storage of multimedia data including images, video, and audio. Since CDCs are physically located in different jurisdictions, and are managed by external parties, data security is a growing concern. Data encryption at CDCs is commonly practiced to improve data security. However, to process the data at CDCs, data must often be decrypted, which raises issues in security. Thus, there is a growing demand for data processing techniques in encrypted domain in such an outsourced environment. In this article, we analyze encrypted domain speech content processing techniques for noise reduction. Noise contaminates speech during transmission or during the acquisition process by recording. As a result, the quality of the speech content is degraded. We apply Shamir's secret sharing as the cryptosystem to encrypt speech data before uploading it to a CDC. We then propose finite impulse response digital filters to reduce white and wind noise in the speech in the encrypted domain. We prove that our proposed schemes meet the security requirements of efficiency, accuracy, and checkability for both semi-honest and malicious adversarial models. Experimental results show that our proposed filtering techniques for speech noise reduction in the encrypted domain produce similar results when compared to plaintext domain processing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Guerrini:2017:IFR, author = "Fabrizio Guerrini and Nicola Adami and Sergio Benini and Alberto Piacenza and Julie Porteous and Marc Cavazza and Riccardo Leonardi", title = "Interactive Film Recombination", journal = j-TOMM, volume = "13", number = "4", pages = "52:1--52:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3103241", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we discuss an innovative media entertainment application called Interactive Movietelling. As an offspring of Interactive Storytelling applied to movies, we propose to integrate narrative generation through artificial intelligence (AI) planning with video processing and modeling to construct filmic variants starting from the baseline content. The integration is possible thanks to content description using semantic attributes pertaining to intermediate-level concepts shared between video processing and planning levels. The output is a recombination of segments taken from the input movie performed so as to convey an alternative plot. User tests on the prototype proved how promising Interactive Movietelling might be, even if it was designed at a proof of concept level. Possible improvements that are suggested here lead to many challenging research issues.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhou:2017:CCB, author = "Mingliang Zhou and Yongfei Zhang and Bo Li and Xupeng Lin", title = "Complexity Correlation-Based {CTU}-Level Rate Control with Direction Selection for {HEVC}", journal = j-TOMM, volume = "13", number = "4", pages = "53:1--53:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3107616", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Rate control is a crucial consideration in high-efficiency video coding (HEVC). The estimation of model parameters is very important for coding tree unit (CTU)-level rate control, as it will significantly affect bit allocation and thus coding performance. However, the model parameters in the CTU-level rate control sometimes fail because of inadequate consideration of the correlation between model parameters and complexity characteristic. In this study, we establish a novel complexity correlation-based CTU-level rate control for HEVC. First, we formulate the model parameter estimation scheme as a multivariable estimation problem; second, based on the complexity correlation of the neighbouring CTU, an optimal direction is selected in five directions for reference CTU set selection during model parameter estimation to further improve the prediction accuracy of the complexity of the current CTU. Third, to improve their precision, the relationship between the model parameters and the complexity of the reference CTU set in the optimal direction is established by using least square method (LS), and the model parameters are solved via the estimated complexity of the current CTU. 
Experimental results show that the proposed algorithm can significantly improve the accuracy of the CTU-level rate control and thus the coding performance; the proposed scheme consistently outperforms HM 16.0 and other state-of-the-art algorithms in a variety of testing configurations. More specifically, up to 8.4\% and on average 6.4\% BD-Rate reduction is achieved compared to HM 16.0 and up to 4.7\% and an average of 3.4\% BD-Rate reduction is achieved compared to other algorithms, with only a slight complexity overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sharrab:2017:MAP, author = "Yousef O. Sharrab and Nabil J. Sarhan", title = "Modeling and Analysis of Power Consumption in Live Video Streaming Systems", journal = j-TOMM, volume = "13", number = "4", pages = "54:1--54:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3115505", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article develops an aggregate power consumption model for live video streaming systems, including many-to-many systems. In many-to-one streaming systems, multiple video sources (i.e., cameras and/or sensors) stream videos to a monitoring station. We model the power consumed by the video sources in the capturing, encoding, and transmission phases and then provide an overall model in terms of the main capturing and encoding parameters, including resolution, frame rate, number of reference frames, motion estimation range, and quantization. We also analyze the power consumed by the monitoring station due to receiving, decoding, and upscaling the received video streams. 
In addition to modeling the power consumption, we model the achieved bitrate of video encoding. We validate the developed models through extensive experiments using two types of systems and different video contents. Furthermore, we analyze many-to-one systems in terms of bitrate, video quality, and the power consumed by the sources, as well as that by the monitoring station, considering the impacts of multiple parameters simultaneously.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ng:2017:WSD, author = "Pai Chet Ng and James She and Kang Eun Jeon and Matthias Baldauf", title = "When Smart Devices Interact With Pervasive Screens: a Survey", journal = j-TOMM, volume = "13", number = "4", pages = "55:1--55:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3115933", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The meeting of pervasive screens and smart devices has witnessed the birth of screen-smart device interaction (SSI), a key enabler to many novel interactive use cases. Most current surveys focus on direct human-screen interaction, and to the best of our knowledge, none have studied state-of-the-art SSI. This survey identifies three core elements of SSI and delivers a timely discussion on SSI oriented around the screen, the smart device, and the interaction modality. Two evaluation metrics (i.e., interaction latency and accuracy) have been adopted and refined to match the evaluation criterion of SSI. The bottlenecks that hinder the further advancement of the current SSI in connection with this metrics are studied. 
Last, future research challenges and opportunities are highlighted in the hope of inspiring continuous research efforts to realize the next generation of SSI.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Franti:2017:MMO, author = "Pasi Fr{\"a}nti and Radu Mariescu-Istodor and Lahari Sengupta", title = "{O-Mopsi}: Mobile Orienteering Game for Sightseeing, Exercising, and Education", journal = j-TOMM, volume = "13", number = "4", pages = "56:1--56:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3115935", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Location-based games have been around already since 2000 but only recently when PokemonGo came to markets it became clear that they can reach wide popularity. In this article, we perform a literature-based analytical study of what kind of issues location-based game design faces, and how they can be solved. We study how to use and verify the location, the role of the games as exergames, use in education, and study technical and safety issues. As a case study, we present O-Mopsi game that combines physical activity with problem solving. It includes three challenges: (1) navigating to the next target, (2) deciding the order of targets, (3) physical movement. All of them are unavoidable and relevant. For guiding the players, we use three types of multimedia: images (targets and maps), sound (user guidance), and GPS (for positioning). We discuss motivational aspects, analysis of the playing, and content creation. 
The quality of experiences is reported based on playing in SciFest Science festivals during 2011--2016.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Messaoudi:2017:PAG, author = "Farouk Messaoudi and Adlen Ksentini and Gwendal Simon and Philippe Bertin", title = "Performance Analysis of Game Engines on Mobile and Fixed Devices", journal = j-TOMM, volume = "13", number = "4", pages = "57:1--57:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3115934", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mobile gaming is an emerging concept wherein gamers are using mobile devices, like smartphones and tablets, to play best-seller games. Compared to dedicated gaming boxes or PCs, these devices still fall short of executing newly complex 3D video games with a rich immersion. Three novel solutions, relying on cloud computing infrastructure, namely, computation offloading, cloud gaming, and client-server architecture, will represent the next generation of game engine architecture aiming at improving the gaming experience. The basis of these aforementioned solutions is the distribution of the game code over different devices (including set-top boxes, PCs, and servers). In order to know how the game code should be distributed, advanced knowledge of game engines is required. By consequence, dissecting and analyzing game engine performances will surely help to better understand how to move in these new directions (i.e., distribute game code), which is so far missing in the literature. 
Aiming at filling this gap, we propose in this article to analyze and evaluate one of the famous engines in the market, that is, ``Unity 3D.'' We begin by detailing the architecture and the game logic of game engines. Then, we propose a test-bed to evaluate the CPU and GPU consumption per frame and per module for nine representative games on three platforms, namely, a stand-alone computer, embedded systems, and web players. Based on the obtained results and observations, we build a valued graph of each module, composing the Unity 3D architecture, which reflects the internal flow and CPU consumption. Finally, we made a comparison in terms of CPU consumption between these architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheung:2017:ECF, author = "Ming Cheung and Xiaopeng Li and James She", title = "An Efficient Computation Framework for Connection Discovery using Shared Images", journal = j-TOMM, volume = "13", number = "4", pages = "58:1--58:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3115951", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the advent and popularity of the social network, social graphs become essential to improve services and information relevance to users for many social media applications to predict follower/followee relationship, community membership, and so on. However, the social graphs could be hidden by users due to privacy concerns or kept by social media. Recently, connections discovered from user-shared images using machine-generated labels are proved to be more accessible alternatives to social graphs. 
But real-time discovery is difficult due to high complexity, and many applications are not possible. This article proposes an efficient computation framework for connection discovery using user-shared images, which is suitable for any image processing and computer vision techniques for connection discovery on the fly. The framework includes the architecture of online computation to facilitate real-time processing, offline computation for a complete processing, and online/offline communication. The proposed framework is implemented to demonstrate its effectiveness by speeding up connection discovery through user-shared images. By studying 300K+ user-shared images from two popular social networks, it is proven that the proposed computation framework reduces 90\% of runtime with comparable accuracy to existing frameworks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2017:DSF, author = "Xiaopeng Li and Ming Cheung and James She", title = "A Distributed Streaming Framework for Connection Discovery Using Shared Videos", journal = j-TOMM, volume = "13", number = "4", pages = "59:1--59:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3120996", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the advances in mobile devices and the popularity of social networks, users can share multimedia content anytime, anywhere. One of the most important types of emerging content is video, which is commonly shared on platforms such as Instagram and Facebook. 
User connections, which indicate whether two users are follower/followee or have the same interests, are essential to improve services and information relevant to users for many social media applications. But they are normally hidden due to users' privacy concerns or are kept confidential by social media sites. Using user-shared content is an alternative way to discover user connections. This article proposes to use user-shared videos for connection discovery with the Bag of Feature Tagging method and proposes a distributed streaming computation framework to facilitate the analytics. Exploiting the uniqueness of shared videos, the proposed framework is divided into Streaming processing and Online and Offline Computation. With experiments using a dataset from Twitter, it has been proved that the proposed method using user-shared videos for connection discovery is feasible. And the proposed computation framework significantly accelerates the analytics, reducing the processing time to only 32\% for follower/followee recommendation. It has also been proved that comparable performance can be achieved with only partial data for each video and leads to more efficient computation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{DeBoer:2017:SRZ, author = "Maaike H. T. 
{De Boer} and Yi-Jie Lu and Hao Zhang and Klamer Schutte and Chong-Wah Ngo and Wessel Kraaij", title = "Semantic Reasoning in Zero Example Video Event Retrieval", journal = j-TOMM, volume = "13", number = "4", pages = "60:1--60:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3131288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Searching in digital video data for high-level events, such as a parade or a car accident, is challenging when the query is textual and lacks visual example images or videos. Current research in deep neural networks is highly beneficial for the retrieval of high-level events using visual examples, but without examples it is still hard to (1) determine which concepts are useful to pre-train ( Vocabulary challenge ) and (2) which pre-trained concept detectors are relevant for a certain unseen high-level event ( Concept Selection challenge ). In our article, we present our Semantic Event Retrieval System which (1) shows the importance of high-level concepts in a vocabulary for the retrieval of complex and generic high-level events and (2) uses a novel concept selection method ( i-w2v ) based on semantic embeddings. Our experiments on the international TRECVID Multimedia Event Detection benchmark show that a diverse vocabulary including high-level concepts improves performance on the retrieval of high-level events in videos and that our novel method outperforms a knowledge-based concept selection method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Guo:2017:EMD, author = "Jianting Guo and Peijia Zheng and Jiwu Huang", title = "An Efficient Motion Detection and Tracking Scheme for Encrypted Surveillance Videos", journal = j-TOMM, volume = "13", number = "4", pages = "61:1--61:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3131342", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Performing detection on surveillance videos contributes significantly to the goals of safety and security. However, performing detection on unprotected surveillance video may reveal the privacy of innocent people in the video. Therefore, striking a proper balance between maintaining personal privacy while enhancing the feasibility of detection is an important issue. One promising solution to this problem is to encrypt the surveillance videos and perform detection on the encrypted videos. Most existing encrypted signal processing methods focus on still images or small data volumes; however, because videos are typically much larger, investigating how to process encrypted videos is a significant challenge. In this article, we propose an efficient motion detection and tracking scheme for encrypted H.264/AVC video bitstreams, which does not require the previous decryption on the encrypted video. The main idea is to first estimate motion information from the bitstream structure and codeword length and, then, propose a region update (RU) algorithm to deal with the loss and error drifting of motion caused by the video encryption. 
The RU algorithm is designed based on the prior knowledge that the object motion in the video is continuous in space and time. Compared to the existing scheme, which is based on video encryption that occurs at the pixel level, the proposed scheme has the advantages of requiring only a small storage of the encrypted video and has a low computational cost for both encryption and detection. Experimental results show that our scheme performs better regarding detection accuracy and execution speed. Moreover, the proposed scheme can work with more than one format-compliant video encryption method, provided that the positions of the macroblocks can be extracted from the encrypted video bitstream. Due to the coupling of video stream encryption and detection algorithms, our scheme can be directly connected to the video stream output (e.g., surveillance cameras) without requiring any camera modifications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Motamedi:2017:PPF, author = "Mohammad Motamedi and Philipp Gysel and Soheil Ghiasi", title = "{PLACID}: a Platform for {FPGA}-Based Accelerator Creation for {DCNNs}", journal = j-TOMM, volume = "13", number = "4", pages = "62:1--62:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3131289", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Dec 23 10:49:23 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep Convolutional Neural Networks (DCNNs) exhibit remarkable performance in a number of pattern recognition and classification tasks. Modern DCNNs involve many millions of parameters and billions of operations. 
Inference using such DCNNs, if implemented as software running on an embedded processor, results in considerable execution time and energy consumption, which is prohibitive in many mobile applications. Field-programmable gate array (FPGA)-based acceleration of DCNN inference is a promising approach to improve both energy consumption and classification throughput. However, the engineering effort required for development and verification of an optimized FPGA-based architecture is significant. In this article, we present PLACID, an automated PLatform for Accelerator CreatIon for DCNNs. PLACID uses an analytical approach to characterization and exploration of the implementation space. PLACID enables generation of an accelerator with the highest throughput for a given DCNN on a specific target FPGA platform. Subsequently, it generates an RTL level architecture in Verilog, which can be passed onto commercial tools for FPGA implementation. PLACID is fully automated, and reduces the accelerator design time from a few months down to a few hours. Experimental results show that architectures synthesized by PLACID yield 2$ \times $ higher throughput density than the best competing approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Akputu:2018:ERU, author = "Oryina Kingsley Akputu and Kah Phooi Seng and Yunli Lee and Li-Minn Ang", title = "Emotion Recognition Using Multiple Kernel Learning toward E-learning Applications", journal = j-TOMM, volume = "14", number = "1", pages = "1:1--1:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3131287", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Adaptive Educational Hypermedia (AEH) e-learning models aim to personalize educational content and learning resources based on the needs of an individual learner. The Adaptive Hypermedia Architecture (AHA) is a specific implementation of the AEH model that exploits the cognitive characteristics of learner feedback to adapt resources accordingly. However, beside cognitive feedback, the learning realm generally includes both the affective and emotional feedback of the learner, which is often neglected in the design of e-learning models. This article aims to explore the potential of utilizing affect or emotion recognition research in AEH models. The framework is referred to as Multiple Kernel Learning Decision Tree Weighted Kernel Alignment (MKLDT-WFA). The MKLDT-WFA has two merits over classical MKL. First, the WFA component only preserves the relevant kernel weights to reduce redundancy and improve the discrimination for emotion classes. Second, training via the decision tree reduces the misclassification issues associated with the SimpleMKL. The proposed work has been evaluated on different emotion datasets and the results confirm the good performances. 
Finally, the conceptual Emotion-based E-learning Model (EEM) with the proposed emotion recognition framework is proposed for future work.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2018:LLP, author = "Kai Li and Guo-Jun Qi and Kien A. Hua", title = "Learning Label Preserving Binary Codes for Multimedia Retrieval: a General Approach", journal = j-TOMM, volume = "14", number = "1", pages = "2:1--2:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152126", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Learning-based hashing has been researched extensively in the past few years due to its great potential in fast and accurate similarity search among huge volumes of multimedia data. In this article, we present a novel multimedia hashing framework, called Label Preserving Multimedia Hashing (LPMH) for multimedia similarity search. In LPMH, a general optimization method is used to learn the joint binary codes of multiple media types by explicitly preserving semantic label information. Compared with existing hashing methods which are typically developed under and thus restricted to some specific objective functions, the proposed optimization strategy is not tied to any specific loss function and can easily incorporate bit balance constraints to produce well-balanced binary codes. Specifically, our formulation leads to a set of Binary Integer Programming (BIP) problems that have exact solutions both with and without bit balance constraints. 
These problems can be solved extremely fast and the solution can easily scale up to large-scale datasets. In the hash function learning stage, the boosted decision trees algorithm is utilized to learn multiple media-specific hash functions that can map heterogeneous data sources into a homogeneous Hamming space for cross-media retrieval. We have comprehensively evaluated the proposed method using a range of large-scale datasets in both single-media and cross-media retrieval tasks. The experimental results demonstrate that LPMH is competitive with state-of-the-art methods in both speed and accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ceballos:2018:IEC, author = "Rodrigo Ceballos and Beatrice Ionascu and Wanjoo Park and Mohamad Eid", title = "Implicit Emotion Communication: {EEG} Classification and Haptic Feedback", journal = j-TOMM, volume = "14", number = "1", pages = "3:1--3:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152128", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Today, ubiquitous digital communication systems do not have an intuitive, natural way of communicating emotion, which, in turn, affects the degree to which humans can emotionally connect and interact with one another. To address this problem, a more natural, intuitive, and implicit emotion communication system was designed and created that employs asymmetry-based EEG emotion classification for detecting the emotional state of the sender and haptic feedback (in the form of tactile gestures) for displaying emotions for a receiver. 
Emotions are modeled in terms of valence (positive/negative emotions) and arousal (intensity of the emotion). Performance analysis shows that the proposed EEG subject-dependent emotion classification model with Free Asymmetry features allows for more flexible feature-generation schemes than other existing algorithms and attains an average accuracy of 92.5\% for valence and 96.5\% for arousal, outperforming previous-generation schemes in high feature space. As for the haptic feedback, a tactile gesture authoring tool and a haptic jacket were developed to design tactile gestures that can intensify emotional reactions in terms of valence and arousal. Experimental study demonstrated that subject-independent emotion transmission through tactile gestures is effective for the arousal dimension of an emotion but is less effective for valence. Consistency in subject-dependent responses for both valence and arousal suggests that personalized tactile gestures would be more effective.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2018:DAQ, author = "Jiyan Wu and Bo Cheng and Yuan Yang and Ming Wang and Junliang Chen", title = "Delay-Aware Quality Optimization in Cloud-Assisted Video Streaming System", journal = j-TOMM, volume = "14", number = "1", pages = "4:1--4:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152116", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud-assisted video streaming has emerged as a new paradigm to optimize multimedia content distribution over the Internet. 
This article investigates the problem of streaming cloud-assisted real-time video to multiple destinations (e.g., cloud video conferencing, multi-player cloud gaming, etc.) over lossy communication networks. The user diversity and network dynamics result in the delay differences among multiple destinations. This research proposes Differentiated cloud-Assisted VIdeo Streaming (DAVIS) framework, which proactively leverages such delay differences in video coding and transmission optimization. First, we analytically formulate the optimization problem of joint coding and transmission to maximize received video quality. Second, we develop a quality optimization framework that integrates the video representation selection and FEC (Forward Error Correction) packet interleaving. The proposed DAVIS is able to effectively perform differentiated quality optimization for multiple destinations by taking advantage of the delay differences in cloud-assisted video streaming system. We conduct the performance evaluation through extensive experiments with the Amazon EC2 instances and Exata emulation platform. Evaluation results show that DAVIS outperforms the reference cloud-assisted streaming solutions in video quality and delay performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jiang:2018:DBC, author = "Shuhui Jiang and Yue Wu and Yun Fu", title = "Deep Bidirectional Cross-Triplet Embedding for Online Clothing Shopping", journal = j-TOMM, volume = "14", number = "1", pages = "5:1--5:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152114", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we address the cross-domain (i.e., street and shop) clothing retrieval problem and investigate its real-world applications for online clothing shopping. It is a challenging problem due to the large discrepancy between street and shop domain images. We focus on learning an effective feature-embedding model to generate robust and discriminative feature representation across domains. Existing triplet embedding models achieve promising results by finding an embedding metric in which the distance between negative pairs is larger than the distance between positive pairs plus a margin. However, existing methods do not address the challenges in the cross-domain clothing retrieval scenario sufficiently. First, the intradomain and cross-domain data relationships need to be considered simultaneously. Second, the number of matched and nonmatched cross-domain pairs are unbalanced. To address these challenges, we propose a deep cross-triplet embedding algorithm together with a cross-triplet sampling strategy. The extensive experimental evaluations demonstrate the effectiveness of the proposed algorithms well. 
Furthermore, we investigate two novel online shopping applications, clothing trying on and accessories recommendation, based on a unified cross-domain clothing retrieval framework.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2018:DFI, author = "Peisong Wang and Qinghao Hu and Zhiwei Fang and Chaoyang Zhao and Jian Cheng", title = "{DeepSearch}: a Fast Image Search Framework for Mobile Devices", journal = j-TOMM, volume = "14", number = "1", pages = "6:1--6:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152127", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Content-based image retrieval (CBIR) is one of the most important applications of computer vision. In recent years, there have been many important advances in the development of CBIR systems, especially Convolutional Neural Networks (CNNs) and other deep-learning techniques. On the other hand, current CNN-based CBIR systems suffer from high computational complexity of CNNs. This problem becomes more severe as mobile applications become more and more popular. The current practice is to deploy the entire CBIR systems on the server side while the client side only serves as an image provider. This architecture can increase the computational burden on the server side, which needs to process thousands of requests per second. Moreover, sending images have the potential of personal information leakage. As the need of mobile search expands, concerns about privacy are growing. 
In this article, we propose a fast image search framework, named DeepSearch, which makes complex image search based on CNNs feasible on mobile phones. To implement the huge computation of CNN models, we present a tensor Block Term Decomposition (BTD) approach as well as a nonlinear response reconstruction method to accelerate the CNNs involved in object detection and feature extraction. The extensive experiments on the ImageNet dataset and Alibaba Large-scale Image Search Challenge dataset show that the proposed accelerating approach BTD can significantly speed up the CNN models and further makes CNN-based image search practical on common smart phones.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2018:RMV, author = "Sicong Liu and Silvestro Roberto Poccia and K. Sel{\c{c}}uk Candan and Maria Luisa Sapino and Xiaolan Wang", title = "Robust Multi-Variate Temporal Features of Multi-Variate Time Series", journal = j-TOMM, volume = "14", number = "1", pages = "7:1--7:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Many applications generate and/or consume multi-variate temporal data, and experts often lack the means to adequately and systematically search for and interpret multi-variate observations. In this article, we first observe that multi-variate time series often carry localized multi-variate temporal features that are robust against noise.
We then argue that these multi-variate temporal features can be extracted by simultaneously considering, at multiple scales, temporal characteristics of the time series along with external knowledge, including variate relationships that are known a priori. Relying on these observations, we develop data models and algorithms to detect robust multi-variate temporal (RMT) features that can be indexed for efficient and accurate retrieval and can be used for supporting data exploration and analysis tasks. Experiments confirm that the proposed RMT algorithm is highly effective and efficient in identifying robust multi-scale temporal features of multi-variate time series.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Guo:2018:OEL, author = "Dan Guo and Wengang Zhou and Houqiang Li and Meng Wang", title = "Online Early-Late Fusion Based on Adaptive {HMM} for Sign Language Recognition", journal = j-TOMM, volume = "14", number = "1", pages = "8:1--8:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152121", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In sign language recognition (SLR) with multimodal data, a sign word can be represented by multiple features, for which there exist an intrinsic property and a mutually complementary relationship among them. To fully explore those relationships, we propose an online early-late fusion method based on the adaptive Hidden Markov Model (HMM).
In terms of the intrinsic property, we discover that inherent latent change states of each sign are related not only to the number of key gestures and body poses but also to their translation relationships. We propose an adaptive HMM method to obtain the hidden state number of each sign by affinity propagation clustering. For the complementary relationship, we propose an online early-late fusion scheme. The early fusion (feature fusion) is dedicated to preserving useful information to achieve a better complementary score, while the late fusion (score fusion) uncovers the significance of those features and aggregates them in a weighting manner. Different from classical fusion methods, the fusion is query adaptive. For different queries, after feature selection (including the combined feature), the fusion weight is inversely proportional to the area under the curve of the normalized query score list for each selected feature. The whole fusion process is effective and efficient. Experiments verify the effectiveness on the signer-independent SLR with large vocabulary. Compared either on different dataset sizes or to different SLR models, our method demonstrates consistent and promising performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yang:2018:JEA, author = "Huei-Fang Yang and Bo-Yao Lin and Kuang-Yu Chang and Chu-Song Chen", title = "Joint Estimation of Age and Expression by Combining Scattering and Convolutional Networks", journal = j-TOMM, volume = "14", number = "1", pages = "9:1--9:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152118", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article tackles the problem of joint estimation of human age and facial expression. This is an important yet challenging problem because expressions can alter face appearances in a similar manner to human aging. Different from previous approaches that deal with the two tasks independently, our approach trains a convolutional neural network (CNN) model that unifies ordinal regression and multi-class classification in a single framework. We demonstrate experimentally that our method performs more favorably against state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2018:EHD, author = "Shao Huang and Weiqiang Wang and Shengfeng He and Rynson W. H. 
Lau", title = "Egocentric Hand Detection Via Dynamic Region Growing", journal = j-TOMM, volume = "14", number = "1", pages = "10:1--10:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152129", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Egocentric videos, which mainly record the activities carried out by the users of wearable cameras, have drawn much research attention in recent years. Due to its lengthy content, a large number of ego-related applications have been developed to abstract the captured videos. As the users are accustomed to interacting with the target objects using their own hands, while their hands usually appear within their visual fields during the interaction, an egocentric hand detection step is involved in tasks like gesture recognition, action recognition, and social interaction understanding. In this work, we propose a dynamic region-growing approach for hand region detection in egocentric videos, by jointly considering hand-related motion and egocentric cues. We first determine seed regions that most likely belong to the hand, by analyzing the motion patterns across successive frames. The hand regions can then be located by extending from the seed regions, according to the scores computed for the adjacent superpixels. These scores are derived from four egocentric cues: contrast, location, position consistency, and appearance continuity. We discuss how to apply the proposed method in real-life scenarios, where multiple hands irregularly appear and disappear from the videos. Experimental results on public datasets show that the proposed method achieves superior performance compared with the state-of-the-art methods, especially in complicated scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wen:2018:VBR, author = "Jiqing Wen and James She and Xiaopeng Li and Hui Mao", title = "Visual Background Recommendation for Dance Performances Using Deep Matrix Factorization", journal = j-TOMM, volume = "14", number = "1", pages = "11:1--11:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3152463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The stage background is one of the most important features for a dance performance, as it helps to create the scene and atmosphere. In conventional dance performances, the background images are usually selected or designed by professional stage designers according to the theme and the style of the dance. In new media dance performances, the stage effects are usually generated by media editing software. Selecting or producing a dance background is quite challenging and is generally carried out by skilled technicians. The goal of the research reported in this article is to ease this process. Instead of searching for background images from the sea of available resources, dancers are recommended images that they are more likely to use. This work proposes the idea of a novel system to recommend images based on content-based social computing. The core part of the system is a probabilistic prediction model to predict a dancer's interests in candidate images through social platforms. Different from traditional collaborative filtering or content-based models, the model proposed here effectively combines a dancer's social behaviors (rating action, click action, etc.) with the visual content of images shared by the dancer using deep matrix factorization (DMF). 
With the help of such a system, dancers can select from the recommended images and set them as the backgrounds of their dance performances through a media editor. According to the experiment results, the proposed DMF model outperforms the previous methods, and when the dataset is very sparse, the proposed DMF model shows more significant results.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pan:2018:AFP, author = "Zhaoqing Pan and Jianjun Lei and Yajuan Zhang and Fu Lee Wang", title = "Adaptive Fractional-Pixel Motion Estimation Skipped Algorithm for Efficient {HEVC} Motion Estimation", journal = j-TOMM, volume = "14", number = "1", pages = "12:1--12:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3159170", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "High-Efficiency Video Coding (HEVC) efficiently addresses the storage and transmit problems of high-definition videos, especially for 4K videos. The variable-size Prediction Units (PUs)--based Motion Estimation (ME) contributes a significant compression rate to the HEVC encoder and also generates a huge computation load. Meanwhile, high-level encoding complexity prevents widespread adoption of the HEVC encoder in multimedia systems. In this article, an adaptive fractional-pixel ME skipped scheme is proposed for low-complexity HEVC ME. First, based on the property of the variable-size PUs--based ME process and the video content partition relationship among variable-size PUs, all inter-PU modes during a coding unit encoding process are classified into root-type PU mode and children-type PU modes. 
Then, according to the ME result of the root-type PU mode, the fractional-pixel ME of its children-type PU modes is adaptively skipped. Simulation results show that, compared to the original ME in HEVC reference software, the proposed algorithm reduces ME encoding time by an average of 63.22\% while encoding efficiency performance is maintained.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zheng:2018:DLC, author = "Zhedong Zheng and Liang Zheng and Yi Yang", title = "A Discriminatively Learned {CNN} Embedding for Person Reidentification", journal = j-TOMM, volume = "14", number = "1", pages = "13:1--13:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3159171", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we revisit two popular convolutional neural networks in person re-identification (re-ID): verification and identification models. The two models have their respective advantages and limitations due to different loss functions. Here, we shed light on how to combine the two models to learn more discriminative pedestrian descriptors. Specifically, we propose a Siamese network that simultaneously computes the identification loss and verification loss. Given a pair of training images, the network predicts the identities of the two input images and whether they belong to the same identity. Our network learns a discriminative embedding and a similarity measurement at the same time, thus taking full usage of the re-ID annotations. Our method can be easily applied on different pretrained networks. 
Albeit simple, the learned embedding improves the state-of-the-art performance on two public person re-ID benchmarks. Further, we show that our architecture can also be applied to image retrieval. The code is available at \url{https://github.com/layumi/2016_person_re-ID}.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sun:2018:RPP, author = "Weiwei Sun and Jiantao Zhou and Shuyuan Zhu and Yuan Yan Tang", title = "Robust Privacy-Preserving Image Sharing over Online Social Networks {(OSNs)}", journal = j-TOMM, volume = "14", number = "1", pages = "14:1--14:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3165265", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 16 18:18:12 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sharing images online has become extremely easy and popular due to the ever-increasing adoption of mobile devices and online social networks (OSNs). The privacy issues arising from image sharing over OSNs have received significant attention in recent years. In this article, we consider the problem of designing a secure, robust, high-fidelity, storage-efficient image-sharing scheme over Facebook, a representative OSN that is widely accessed. To accomplish this goal, we first conduct an in-depth investigation on the manipulations that Facebook performs to the uploaded images. Assisted by such knowledge, we propose a DCT-domain image encryption/decryption framework that is robust against these lossy operations. 
As verified theoretically and experimentally, superior performance in terms of data privacy, quality of the reconstructed images, and storage cost can be achieved.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Berretti:2018:IAS, author = "Stefano Berretti", title = "Improved Audio Steganalytic Feature and Its Applications in Audio Forensics", journal = j-TOMM, volume = "14", number = "2", pages = "43:1--43:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190575", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Digital multimedia steganalysis has attracted wide attention over the past decade. Currently, there are many algorithms for detecting image steganography. However, little research has been devoted to audio steganalysis. Since the statistical properties of image and audio files are quite different, features that are effective in image steganalysis may not be effective for audio. In this article, we design an improved audio steganalytic feature set derived from both the time and Mel-frequency domains for detecting some typical steganography in the time domain, including LSB matching, Hide4PGP, and Steghide. The experiment results, evaluated on different audio sources, including various music and speech clips of different complexity, have shown that the proposed features significantly outperform the existing ones. Moreover, we use the proposed features to detect and further identify some typical audio operations that would probably be used in audio tampering. 
The extensive experiment results have shown that the proposed features also outperform the related forensic methods, especially when the length of the audio clip is small, such as audio clips with 800 samples. This is very important in real forensic situations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gupta:2018:AGM, author = "Abhinav Gupta and Divya Singhal", title = "Analytical Global Median Filtering Forensics Based on Moment Histograms", journal = j-TOMM, volume = "14", number = "2", pages = "44:1--44:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3176650", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Median filtering forensics in images has gained wide attention from researchers in recent years because of its inherent nature of preserving visual traces. Although many forensic methods are developed for median filtering detection, probability of detection reduces under JPEG compression at low-quality factors and for low-resolution images. The feature set reduction is also a challenging issue among existing detectors. In this article, a 19-dimensional feature set is analytically derived from image skewness and kurtosis histograms. This new feature set is exploited for the purpose of global median filtering forensics and verified with exhaustive experimental results. The efficacy of the method is tested on six popular databases (UCID, BOWS2, BOSSBase, NRCS, RAISE, and DID) and found that the new feature set uncovers filtering traces for moderate, low JPEG post-compression and low-resolution operation. 
Our proposed method yields lowest probability of error and largest area under the ROC curve for most of the test cases in comparison with previous approaches. Some novel test cases are introduced to thoroughly assess the benefits and limitations of the proposed method. The obtained results indicate that the proposed method would provide an important tool to the field of passive image forensics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2018:MSH, author = "Min Huang and Song-Zhi Su and Hong-Bo Zhang and Guo-Rong Cai and Dongying Gong and Donglin Cao and Shao-Zi Li", title = "Multifeature Selection for {$3$D} Human Action Recognition", journal = j-TOMM, volume = "14", number = "2", pages = "45:1--45:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177757", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In mainstream approaches for 3D human action recognition, depth and skeleton features are combined to improve recognition accuracy. However, this strategy results in high feature dimensions and low discrimination due to redundant feature vectors. To solve this drawback, a multi-feature selection approach for 3D human action recognition is proposed in this paper. First, three novel single-modal features are proposed to describe depth appearance, depth motion, and skeleton motion. Second, a classification entropy of random forest is used to evaluate the discrimination of the depth appearance based features. Finally, one of the three features is selected to recognize the sample according to the discrimination evaluation. 
Experimental results show that the proposed multi-feature selection approach significantly outperforms other approaches based on single-modal feature and feature fusion.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mazaheri:2018:LMC, author = "Amir Mazaheri and Boqing Gong and Mubarak Shah", title = "Learning a Multi-Concept Video Retrieval Model with Multiple Latent Variables", journal = j-TOMM, volume = "14", number = "2", pages = "46:1--46:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3176647", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Effective and efficient video retrieval has become a pressing need in the ``big video'' era. The objective of this work is to provide a principled model for computing the ranking scores of a video in response to one or more concepts, where the concepts could be directly supplied by users or inferred by the system from the user queries. Indeed, how to deal with multi-concept queries has become a central component in modern video retrieval systems that accept text queries. However, it has been long overlooked and simply implemented by weighted averaging of the corresponding concept detectors' scores. Our approach, which can be considered as a latent ranking SVM, integrates the advantages of various recent works in text and image retrieval, such as choosing ranking over structured prediction, modeling inter-dependencies between querying concepts, and so on. Videos consist of shots, and we use latent variables to account for the mutually complementary cues within and across shots. Concept labels of shots are scarce and noisy. 
We introduce a simple and effective technique to make our model robust to outliers. Our approach gives superior performance when it is tested on not only the queries seen at training but also novel queries, some of which consist of more concepts than the queries used for training.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tulilaulu:2018:DM, author = "Aurora Tulilaulu and Matti Nelimarkka and Joonas Paalasmaa and Daniel Johnson and Dan Ventura and Petri Myllys and Hannu Toivonen", title = "Data Musicalization", journal = j-TOMM, volume = "14", number = "2", pages = "47:1--47:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3184742", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Data musicalization is the process of automatically composing music based on given data as an approach to perceptualizing information artistically. The aim of data musicalization is to evoke subjective experiences in relation to the information rather than merely to convey unemotional information objectively. This article is written as a tutorial for readers interested in data musicalization. We start by providing a systematic characterization of musicalization approaches, based on their inputs, methods, and outputs. 
We then illustrate data musicalization techniques with examples from several applications: one that perceptualizes physical sleep data as music, several that artistically compose music inspired by the sleep data, one that musicalizes on-line chat conversations to provide a perceptualization of liveliness of a discussion, and one that uses musicalization in a gamelike mobile application that allows its users to produce music. We additionally provide a number of electronic samples of music produced by the different musicalization applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cornia:2018:PMA, author = "Marcella Cornia and Lorenzo Baraldi and Giuseppe Serra and Rita Cucchiara", title = "Paying More Attention to Saliency: Image Captioning with Saliency and Context Attention", journal = j-TOMM, volume = "14", number = "2", pages = "48:1--48:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177745", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image captioning has been recently gaining a lot of attention thanks to the impressive achievements shown by deep captioning architectures, which combine Convolutional Neural Networks to extract image representations and Recurrent Neural Networks to generate the corresponding captions. At the same time, a significant research effort has been dedicated to the development of saliency prediction models, which can predict human eye fixations. 
Even though saliency information could be useful to condition an image captioning architecture, by providing an indication of what is salient and what is not, research is still struggling to incorporate these two techniques. In this work, we propose an image captioning approach in which a generative recurrent neural network can focus on different parts of the input image during the generation of the caption, by exploiting the conditioning given by a saliency prediction model on which parts of the image are salient and which are contextual. We show, through extensive quantitative and qualitative experiments on large-scale datasets, that our model achieves superior performance with respect to captioning baselines with and without saliency and to different state-of-the-art approaches combining saliency and captioning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wen:2018:CEE, author = "Longyin Wen and Honggang Qi and Siwei Lyu", title = "Contrast Enhancement Estimation for Digital Image Forensics", journal = j-TOMM, volume = "14", number = "2", pages = "49:1--49:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183518", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Inconsistency in contrast enhancement can be used to expose image forgeries. In this work, we describe a new method to estimate contrast enhancement operations from a single image. Our method takes advantage of the nature of contrast enhancement as a mapping between pixel values and the distinct characteristics it introduces to the image pixel histogram. 
Our method recovers the original pixel histogram and the contrast enhancement simultaneously from a single image with an iterative algorithm. Unlike previous works, our method is robust in the presence of additive noise perturbations that are used to hide the traces of contrast enhancement. Furthermore, we also develop an effective method to detect image regions undergone contrast enhancement transformations that are different from the rest of the image, and we use this method to detect composite images. We perform extensive experimental evaluations to demonstrate the efficacy and efficiency of our method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jiang:2018:DMP, author = "Yu-Gang Jiang and Minjun Li and Xi Wang and Wei Liu and Xian-Sheng Hua", title = "{DeepProduct}: Mobile Product Search With Portable Deep Features", journal = j-TOMM, volume = "14", number = "2", pages = "50:1--50:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3184745", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Features extracted by deep networks have been popular in many visual search tasks. This article studies deep network structures and training schemes for mobile visual search. The goal is to learn an effective yet portable feature representation that is suitable for bridging the domain gap between mobile user photos and (mostly) professionally taken product images while keeping the computational cost acceptable for mobile-based applications. The technical contributions are twofold. 
First, we propose an alternative of the contrastive loss popularly used for training deep Siamese networks, namely robust contrastive loss, where we relax the penalty on some positive and negative pairs to alleviate overfitting. Second, a simple multitask fine-tuning scheme is leveraged to train the network, which not only utilizes knowledge from the provided training photo pairs but also harnesses additional information from the large ImageNet dataset to regularize the fine-tuning process. Extensive experiments on challenging real-world datasets demonstrate that both the robust contrastive loss and the multitask fine-tuning scheme are effective, leading to very promising results with a time cost suitable for mobile product search scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ahmad:2018:EDM, author = "Kashif Ahmad and Mohamed Lamine Mekhalfi and Nicola Conci and Farid Melgani and Francesco {De Natale}", title = "Ensemble of Deep Models for Event Recognition", journal = j-TOMM, volume = "14", number = "2", pages = "51:1--51:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3199668", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we address the problem of recognizing an event from a single related picture. Given the large number of event classes and the limited information contained in a single shot, the problem is known to be particularly hard. 
To achieve a reliable detection, we propose a combination of multiple classifiers, and we compare three alternative strategies to fuse the results of each classifier, namely: (i) induced order weighted averaging operators, (ii) genetic algorithms, and (iii) particle swarm optimization. Each method is aimed at determining the optimal weights to be assigned to the decision scores yielded by different deep models, according to the relevant optimization strategy. Experimental tests have been performed on three event recognition datasets, evaluating the performance of various deep models, both alone and selectively combined. Experimental results demonstrate that the proposed approach outperforms traditional multiple classifier solutions based on uniform weighting, and outperforms recent state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2018:UER, author = "Wei Hu and Mozhdeh Seifi and Erik Reinhard", title = "Over- and Under-Exposure Reconstruction of a Single Plenoptic Capture", journal = j-TOMM, volume = "14", number = "2", pages = "52:1--52:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3199514", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Light field images, for example, taken with plenoptic cameras, offer interesting post-processing opportunities, including depth-of-field management, depth estimation, viewpoint selection, and 3D image synthesis. Like most capture devices, however, plenoptic cameras have a limited dynamic range, so that over- and under-exposed areas in plenoptic images are commonplace. 
We therefore present a straightforward and robust plenoptic reconstruction technique based on the observation that vignetting causes peripheral views to receive less light than central views. Thus, corresponding pixels in different views can be used to reconstruct illumination, especially in areas where information missing in one view is present in another. Our algorithm accurately reconstructs under- and over-exposed regions (known as declipping), additionally affording an increase in peak luminance by up to two f-stops, and a comparable lowering of the noise floor. The key advantages of this approach are that no hardware modifications are necessary to improve the dynamic range, that no multiple exposure techniques are required, and therefore that no ghosting or other artifacts are introduced.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Skorin-Kapov:2018:GES, author = "Lea Skorin-Kapov and Mart{\'\i}n Varela and Tobias Ho{\ss}feld and Kuan-Ta Chen", title = "Guest Editorial: Special Issue on {``QoE Management for Multimedia Services''}", journal = j-TOMM, volume = "14", number = "2s", pages = "28:1--28:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3192332", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Skorin-Kapov:2018:SEC, author = "Lea Skorin-Kapov and Mart{\'\i}n Varela and Tobias Ho{\ss}feld and Kuan-Ta Chen", title = "A Survey of Emerging Concepts and Challenges for {QoE} Management of Multimedia Services", journal = j-TOMM, volume = "14", number = "2s", pages = "29:1--29:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3176648", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Quality of Experience (QoE) has received much attention over the past years and has become a prominent issue for delivering services and applications. A significant amount of research has been devoted to understanding, measuring, and modelling QoE for a variety of media services. The next logical step is to actively exploit that accumulated knowledge to improve and manage the quality of multimedia services, while at the same time ensuring efficient and cost-effective network operations. Moreover, with many different players involved in the end-to-end service delivery chain, identifying the root causes of QoE impairments and finding effective solutions for meeting the end users' requirements and expectations in terms of service quality is a challenging and complex problem. In this article, we survey state-of-the-art findings and present emerging concepts and challenges related to managing QoE for networked multimedia services. 
Going beyond a number of previously published survey articles addressing the topic of QoE management, we address QoE management in the context of ongoing developments, such as the move to softwarized networks, the exploitation of big data analytics and machine learning, and the steady rise of new and immersive services (e.g., augmented and virtual reality). We address the implications of such paradigm shifts in terms of new approaches in QoE modeling and the need for novel QoE monitoring and management infrastructures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhu:2018:MIV, author = "Yi Zhu and Sharath Chandra Guntuku and Weisi Lin and Gheorghita Ghinea and Judith A. Redi", title = "Measuring Individual Video {QoE}: a Survey, and Proposal for Future Directions Using Social Media", journal = j-TOMM, volume = "14", number = "2s", pages = "30:1--30:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183512", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The next generation of multimedia services have to be optimized in a personalized way, taking user factors into account for the evaluation of individual experience. Previous works have investigated the influence of user factors mostly in a controlled laboratory environment which often includes a limited number of users and fails to reflect real-life environment. Social media, especially Facebook, provide an interesting alternative for Internet-based subjective evaluation. 
In this article, we develop (and open-source) a Facebook application, named YouQ$^1$, as an experimental platform for studying individual experience for videos. Our results show that subjective experiments based on YouQ can produce reliable results as compared to a controlled laboratory experiment. Additionally, YouQ has the ability to collect user information automatically from Facebook, which can be used for modeling individual experience.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Petrangeli:2018:QEC, author = "Stefano Petrangeli and Jeroen {Van Der Hooft} and Tim Wauters and Filip {De Turck}", title = "Quality of Experience-Centric Management of Adaptive Video Streaming Services: Status and Challenges", journal = j-TOMM, volume = "14", number = "2s", pages = "31:1--31:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3165266", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video streaming applications currently dominate Internet traffic. Particularly, HTTP Adaptive Streaming (HAS) has emerged as the dominant standard for streaming videos over the best-effort Internet, thanks to its capability of matching the video quality to the available network resources. In HAS, the video client is equipped with a heuristic that dynamically decides the most suitable quality to stream the content, based on information such as the perceived network bandwidth or the video player buffer status. The goal of this heuristic is to optimize the quality as perceived by the user, the so-called Quality of Experience (QoE). 
Despite the many advantages brought by the adaptive streaming principle, optimizing users' QoE is far from trivial. Current heuristics are still suboptimal when sudden bandwidth drops occur, especially in wireless environments, thus leading to freezes in the video playout, the main factor influencing users' QoE. This issue is aggravated in case of live events, where the player buffer has to be kept as small as possible in order to reduce the playout delay between the user and the live signal. In light of the above, in recent years, several works have been proposed with the aim of extending the classical purely client-based structure of adaptive video streaming, in order to fully optimize users' QoE. In this article, a survey is presented of research works on this topic together with a classification based on where the optimization takes place. This classification goes beyond client-based heuristics to investigate the usage of server- and network-assisted architectures and of new application and transport layer protocols. In addition, we outline the major challenges currently arising in the field of multimedia delivery, which are going to be of extreme relevance in future years.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bhat:2018:SNA, author = "Divyashri Bhat and Amr Rizk and Michael Zink and Ralf Steinmetz", title = "{SABR}: Network-Assisted Content Distribution for {QoE}-Driven {ABR} Video Streaming", journal = j-TOMM, volume = "14", number = "2s", pages = "32:1--32:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183516", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "State-of-the-art software-defined wide area networks (SD-WANs) provide the foundation for flexible and highly resilient networking. In this work, we design, implement, and evaluate a novel architecture (denoted as SABR) that leverages the benefits of software-defined networking (SDN) to provide network-assisted adaptive bitrate streaming. With clients retaining full control of their streaming algorithms, we clearly show that by this network assistance, both the clients and the content providers benefit significantly in terms of quality of experience (QoE) and content origin offloading. SABR utilizes information on available bandwidths per link and network cache contents to guide video streaming clients with the goal of improving the viewer's QoE. In addition, SABR uses SDN capabilities to dynamically program flows to optimize the utilization of content delivery network caches. Backed by our study of SDN-assisted streaming, we discuss the change in the requirements for network-to-player APIs that enables flexible video streaming. We illustrate the difficulty of the problem and the impact of SDN-assisted streaming on QoE metrics using various well-established player algorithms. 
We evaluate SABR together with state-of-the-art dynamic adaptive streaming over HTTP (DASH) quality adaptation algorithms through a series of experiments performed on a real-world, SDN-enabled testbed network with minimal modifications to an existing DASH client. In addition, we compare the performance of different caching strategies in combination with SABR. Our trace-based measurements show the substantial improvement in cache hit rates and QoE metrics in conjunction with SABR indicating a rich design space for jointly optimized SDN-assisted caching architectures for adaptive bitrate video streaming applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Burger:2018:GAV, author = "Valentin Burger and Thomas Zinner and Lam Dinh-Xuan and Florian Wamser and Phuoc Tran-Gia", title = "A Generic Approach to Video Buffer Modeling Using Discrete-Time Analysis", journal = j-TOMM, volume = "14", number = "2s", pages = "33:1--33:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183511", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The large share of traffic in the Internet generated by video streaming services puts high loads on access and aggregation networks, resulting in high costs for the content delivery infrastructure. To reduce the bandwidth consumed while maintaining a high playback quality, video players use policies that control and limit the buffer level by using thresholds for pausing and continuing the video download. This allows shaping the bandwidth consumed by video streams and limiting the traffic wasted in case of playback abortion. 
author = "Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen and Leonardo Favario and Enrico Masala",
{Quality}-of-Experience Measurements of Mobile Live Video Broadcasting", journal = j-TOMM, volume = "14", number = "2s", pages = "34:1--34:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3165279", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Broadcasting live video directly from mobile devices is rapidly gaining popularity with applications like Periscope and Facebook Live. The quality of experience (QoE) provided by these services comprises many factors, such as quality of transmitted video, video playback stalling, end-to-end latency, and impact on battery life, and they are not yet well understood. In this article, we examine mainly the Periscope service through a comprehensive measurement study and compare it in some aspects to Facebook Live. We shed light on the usage of Periscope through analysis of crawled data and then investigate the aforementioned QoE factors through statistical analyses as well as controlled small-scale measurements using a couple of different smartphones and both versions, Android and iOS, of the two applications. We report a number of findings including the discrepancy in latency between the two most commonly used protocols, RTMP and HLS, surprising surges in bandwidth demand caused by the Periscope app's chat feature, substantial variations in video quality, poor adaptation of video bitrate to available upstream bandwidth at the video broadcaster side, and significant power consumption caused by the applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
title = "{PMS}: a Novel Scale-Adaptive and Quality-Adaptive Hybrid {P2P\slash}Multisource Solution for Live Streaming",
Large-scale evaluations conducted with 300 peers located in France permits validating our approach and algorithms over flash crowd events and allow us to conclude that PMS can reach the optimal trade-offs between QoE and system scale.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Floris:2018:QAO, author = "Alessandro Floris and Arslan Ahmad and Luigi Atzori", title = "{QoE}-Aware {OTT-ISP} Collaboration in Service Management: Architecture and Approaches", journal = j-TOMM, volume = "14", number = "2s", pages = "36:1--36:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3183517", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "It is a matter of fact that quality of experience (QoE) has become one of the key factors determining whether a new multimedia service will be successfully accepted by the final users. Accordingly, several QoE models have been developed with the aim of capturing the perception of the user by considering as many influencing factors as possible. However, when it comes to adopting these models in the management of the services and networks, it frequently happens that no single provider has access to all of the tools to either measure all influencing factors parameters or control over the delivered quality. In particular, it often happens to the over-the-top (OTT) and Internet service providers (ISPs), which act with complementary roles in the service delivery over the Internet. 
On the basis of this consideration, in this article we first highlight the importance of a possible OTT-ISP collaboration for a joint service management in terms of technical and economic aspects. Then we propose a general reference architecture for a possible collaboration and information exchange among them. Finally, we define three different approaches, namely joint venture, customer lifetime value based, and QoE fairness based. The first aims to maximize the revenue by providing better QoE to customers paying more. The second aims to maximize the profit by providing better QoE to the most profitable customers (MPCs). The third aims to maximize QoE fairness among all customers. Finally, we conduct simulations to compare the three approaches in terms of QoE provided to the users, profit generated for the providers, and QoE fairness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yan:2018:GES, author = "Yan Yan and Liqiang Nie and Rita Cucchiara", title = "Guest Editorial: Special Section on {``Multimedia Understanding via Multimodal Analytics''}", journal = j-TOMM, volume = "14", number = "2s", pages = "37:1--37:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3192334", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tiwari:2018:MMS, author = "Akanksha Tiwari and Christian {Von Der Weth} and Mohan S. 
Kankanhalli", title = "Multimodal Multiplatform Social Media Event Summarization", journal = j-TOMM, volume = "14", number = "2s", pages = "38:1--38:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3115433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Social media platforms are turning into important news sources since they provide real-time information from different perspectives. However, high volume, dynamism, noise, and redundancy exhibited by social media data make it difficult to comprehend the entire content. Recent works emphasize on summarizing the content of either a single social media platform or of a single modality (either textual or visual). However, each platform has its own unique characteristics and user base, which brings to light different aspects of real-world events. This makes it critical as well as challenging to combine textual and visual data from different platforms. In this article, we propose summarization of real-world events with data stemming from different platforms and multiple modalities. We present the use of a Markov Random Fields based similarity measure to link content across multiple platforms. This measure also enables the linking of content across time, which is useful for tracking the evolution of long-running events. For the final content selection, summarization is modeled as a subset selection problem. To handle the complexity of the optimal subset selection, we propose the use of submodular objectives. Facets such as coverage, novelty, and significance are modeled as submodular objectives in a multimodal social media setting. We conduct a series of quantitative and qualitative experiments to illustrate the effectiveness of our approach compared to alternative methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2018:SAM, author = "Anran Wang and Jianfei Cai and Jiwen Lu and Tat-Jen Cham", title = "Structure-Aware Multimodal Feature Fusion for {RGB-D} Scene Classification and Beyond", journal = j-TOMM, volume = "14", number = "2s", pages = "39:1--39:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3115932", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While convolutional neural networks (CNNs) have been excellent for object recognition, the greater spatial variability in scene images typically means that the standard full-image CNN features are suboptimal for scene classification. In this article, we investigate a framework allowing greater spatial flexibility, in which the Fisher vector (FV)-encoded distribution of local CNN features, obtained from a multitude of region proposals per image, is considered instead. The CNN features are computed from an augmented pixel-wise representation consisting of multiple modalities of RGB, HHA, and surface normals, as extracted from RGB-D data. More significantly, we make two postulates: (1) component sparsity-that only a small variety of region proposals and their corresponding FV GMM components contribute to scene discriminability, and (2) modal nonsparsity-that features from all modalities are encouraged to coexist. In our proposed feature fusion framework, these are implemented through regularization terms that apply group lasso to GMM components and exclusive group lasso across modalities. 
By learning and combining regressors for both proposal-based FV features and global CNN features, we are able to achieve state-of-the-art scene classification performance on the SUNRGBD Dataset and NYU Depth Dataset V2. Moreover, we further apply our feature fusion framework on an action recognition task to demonstrate that our framework can be generalized for other multimodal well-structured features. In particular, for action recognition, we enforce interpart sparsity to choose more discriminative body parts, and intermodal nonsparsity to make informative features from both appearance and motion modalities coexist. Experimental results on the JHMDB and MPII Cooking Datasets show that our feature fusion is also very effective for action recognition, achieving very competitive performance compared with the state of the art.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2018:ICD, author = "Cheng Wang and Haojin Yang and Christoph Meinel", title = "Image Captioning with Deep Bidirectional {LSTMs} and Multi-Task Learning", journal = j-TOMM, volume = "14", number = "2s", pages = "40:1--40:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3115432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Generating a novel and descriptive caption of an image is drawing increasing interests in computer vision, natural language processing, and multimedia communities. In this work, we propose an end-to-end trainable deep bidirectional LSTM (Bi-LSTM (Long Short-Term Memory)) model to address the problem. 
By combining a deep convolutional neural network (CNN) and two separate LSTM networks, our model is capable of learning long-term visual-language interactions by making use of history and future context information at high-level semantic space. We also explore deep multimodal bidirectional models, in which we increase the depth of nonlinearity transition in different ways to learn hierarchical visual-language embeddings. Data augmentation techniques such as multi-crop, multi-scale, and vertical mirror are proposed to prevent overfitting in training deep models. To understand how our models ``translate'' image to sentence, we visualize and qualitatively analyze the evolution of Bi-LSTM internal states over time. The effectiveness and generality of proposed models are evaluated on four benchmark datasets: Flickr8K, Flickr30K, MSCOCO, and Pascal1K datasets. We demonstrate that Bi-LSTM models achieve highly competitive performance on both caption generation and image-sentence retrieval even without integrating an additional mechanism (e.g., object detection, attention model). Our experiments also prove that multi-task learning is beneficial to increase model generality and gain performance. We also demonstrate the performance of transfer learning of the Bi-LSTM model significantly outperforms previous methods on the Pascal1K dataset.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2018:TPA, author = "Zhenguang Liu and Yingjie Xia and Qi Liu and Qinming He and Chao Zhang and Roger Zimmermann", title = "Toward Personalized Activity Level Prediction in Community Question Answering {Websites}", journal = j-TOMM, volume = "14", number = "2s", pages = "41:1--41:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3187011", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue May 29 08:39:06 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Community Question Answering (CQA) websites have become valuable knowledge repositories. Millions of internet users resort to CQA websites to seek answers to their encountered questions. CQA websites provide information far beyond a search on a site such as Google due to (1) the plethora of high-quality answers, and (2) the capabilities to post new questions toward the communities of domain experts. While most research efforts have been made to identify experts or to preliminarily detect potential experts of CQA websites, there has been a remarkable shift toward investigating how to keep the engagement of experts. Experts are usually the major contributors of high-quality answers and questions of CQA websites. Consequently, keeping the expert communities active is vital to improving the lifespan of these websites. In this article, we present an algorithm termed PALP to predict the activity level of expert users of CQA websites. To the best of our knowledge, PALP is the first approach to address a personalized activity level prediction model for CQA websites. Furthermore, it takes into consideration user behavior change over time and focuses specifically on expert users. 
Extensive experiments on the Stack Overflow website demonstrate the competitiveness of PALP over existing methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Abdallah:2018:AHD, author = "Maha Abdallah", title = "Aesthetic Highlight Detection in Movies Based on Synchronization of Spectators' Reactions", journal = j-TOMM, volume = "14", number = "3", pages = "68:1--68:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3175497", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Detection of aesthetic highlights is a challenge for understanding the affective processes taking place during movie watching. In this article, we study spectators' responses to movie aesthetic stimuli in a social context. Moreover, we look for uncovering the emotional component of aesthetic highlights in movies. Our assumption is that synchronized spectators' physiological and behavioral reactions occur during these highlights because: (i) aesthetic choices of filmmakers are made to elicit specific emotional reactions (e.g., special effects, empathy, and compassion toward a character) and (ii) watching a movie together causes spectators' affective reactions to be synchronized through emotional contagion. We compare different approaches to estimation of synchronization among multiple spectators' signals, such as pairwise, group, and overall synchronization measures to detect aesthetic highlights in movies. 
The results show that the unsupervised architecture relying on synchronization measures is able to capture different properties of spectators' synchronization and detect aesthetic highlights based on both spectators' electrodermal and acceleration signals. We discover that pairwise synchronization measures perform the most accurately independently of the category of the highlights and movie genres. Moreover, we observe that electrodermal signals have more discriminative power than acceleration signals for highlight detection.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bai:2018:ADA, author = "Yalong Bai and Kuiyuan Yang and Tao Mei and Wei-Ying Ma and Tiejun Zhao", title = "Automatic Data Augmentation from Massive {Web} Images for Deep Visual Recognition", journal = j-TOMM, volume = "14", number = "3", pages = "69:1--69:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3204941", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Large-scale image datasets and deep convolutional neural networks (DCNNs) are the two primary driving forces for the rapid progress in generic object recognition tasks in recent years. While lots of network architectures have been continuously designed to pursue lower error rates, few efforts are devoted to enlarging existing datasets due to high labeling costs and unfair comparison issues. In this article, we aim to achieve lower error rates by augmenting existing datasets in an automatic manner. 
Our method leverages both the web and DCNN, where the web provides massive images with rich contextual information, and DCNN replaces humans to automatically label images under the guidance of web contextual information. Experiments show that our method can automatically scale up existing datasets significantly from billions of web pages with high accuracy. The performance on object recognition tasks and transfer learning tasks have been significantly improved by using the automatically augmented datasets, which demonstrates that more supervisory information has been automatically gathered from the web. Both the dataset and models trained on the dataset have been made publicly available.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tan:2018:UCD, author = "Min Tan and Jun Yu and Zhou Yu and Fei Gao and Yong Rui and Dacheng Tao", title = "User-Click-Data-Based Fine-Grained Image Recognition via Weakly Supervised Metric Learning", journal = j-TOMM, volume = "14", number = "3", pages = "70:1--70:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209666", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present a novel fine-grained image recognition framework using user click data, which can bridge the semantic gap in distinguishing categories that are similar in visual. As query set in click data is usually large-scale and redundant, we first propose a click-feature-based query-merging approach to merge queries with similar semantics and construct a compact click feature. 
Afterward, we utilize this compact click feature and convolutional neural network (CNN)-based deep visual feature to jointly represent an image. Finally, with the combined feature, we employ the metric-learning-based template-matching scheme for efficient recognition. Considering the heavy noise in the training data, we introduce a reliability variable to characterize the image reliability, and propose a weakly-supervised metric and template leaning with smooth assumption and click prior (WMTLSC) method to jointly learn the distance metric, object templates, and image reliability. Extensive experiments are conducted on a public Clickture-Dog dataset and our newly established Clickture-Bird dataset. It is shown that the click-data-based query merging helps generating a highly compact (the dimension is reduced to 0.9\%) and dense click feature for images, which greatly improves the computational efficiency. Also, introducing this click feature into CNN feature further boosts the recognition accuracy. The proposed framework performs much better than previous state-of-the-arts in fine-grained recognition tasks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bentaleb:2018:OSO, author = "Abdelhak Bentaleb and Ali C. 
Begen and Roger Zimmermann", title = "{ORL--SDN}: Online Reinforcement Learning for {SDN}-Enabled {HTTP} Adaptive Streaming", journal = j-TOMM, volume = "14", number = "3", pages = "71:1--71:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3219752", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In designing an HTTP adaptive streaming (HAS) system, the bitrate adaptation scheme in the player is a key component to ensure a good quality of experience (QoE) for viewers. We propose a new online reinforcement learning optimization framework, called ORL-SDN, targeting HAS players running in a software-defined networking (SDN) environment. We leverage SDN to facilitate the orchestration of the adaptation schemes for a set of HAS players. To reach a good level of QoE fairness in a large population of players, we cluster them based on a perceptual quality index. We formulate the adaptation process as a Partially Observable Markov Decision Process and solve the per-cluster optimization problem using an online Q-learning technique that leverages model predictive control and parallelism via aggregation to avoid a per-cluster suboptimal selection and to accelerate the convergence to an optimum. This framework achieves maximum long-term revenue by selecting the optimal representation for each cluster under time-varying network conditions. The results show that ORL-SDN delivers substantial improvements in viewer QoE, presentation quality stability, fairness, and bandwidth utilization over well-known adaptation schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kong:2018:EVE, author = "Lingchao Kong and Rui Dai", title = "Efficient Video Encoding for Automatic Video Analysis in Distributed Wireless Surveillance Systems", journal = j-TOMM, volume = "14", number = "3", pages = "72:1--72:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226036", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In many distributed wireless surveillance applications, compressed videos are used for performing automatic video analysis tasks. The accuracy of object detection, which is essential for various video analysis tasks, can be reduced due to video quality degradation caused by lossy compression. This article introduces a video encoding framework with the objective of boosting the accuracy of object detection for wireless surveillance applications. The proposed video encoding framework is based on systematic investigation of the effects of lossy compression on object detection. It has been found that current standardized video encoding schemes cause temporal domain fluctuation for encoded blocks in stable background areas and spatial texture degradation for encoded blocks in dynamic foreground areas of a raw video, both of which degrade the accuracy of object detection. Two measures, the sum-of-absolute frame difference (SFD) and the degradation of texture in 2D transform domain (TXD), are introduced to depict the temporal domain fluctuation and the spatial texture degradation in an encoded video, respectively. 
The proposed encoding framework is designed to suppress unnecessary temporal fluctuation in stable background areas and preserve spatial texture in dynamic foreground areas based on the two measures, and it introduces new mode decision strategies for both intra- and interframes to improve the accuracy of object detection while maintaining an acceptable rate distortion performance. Experimental results show that, compared with traditional encoding schemes, the proposed scheme improves the performance of object detection and results in lower bit rates and significantly reduced complexity with comparable quality in terms of PSNR and SSIM.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2018:ICA, author = "Anqi Wang and Haifeng Hu and Liang Yang", title = "Image Captioning with Affective Guiding and Selective Attention", journal = j-TOMM, volume = "14", number = "3", pages = "73:1--73:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226037", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image captioning is an increasingly important problem associated with artificial intelligence, computer vision, and natural language processing. Recent works revealed that it is possible for a machine to generate meaningful and accurate sentences for images. However, most existing methods ignore latent emotional information in an image. In this article, we propose a novel image captioning model with Affective Guiding and Selective Attention Mechanism named AG-SAM. In our method, we aim to bridge the affective gap between image captioning and the emotional response elicited by the image. 
First, we introduce affective components that capture higher-level concepts encoded in images into AG-SAM. Hence, our language model can be adapted to generate sentences that are more passionate and emotive. In addition, a selective gate acting on the attention mechanism controls the degree of how much visual information AG-SAM needs. Experimental results have shown that our model outperforms most existing methods, clearly reflecting an association between images and emotional components that is usually ignored in existing works.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sikora:2018:SAS, author = "Marjan Sikora and Mladen Russo and Jurica Derek and Ante Jurcevi{\'c}", title = "Soundscape of an Archaeological Site Recreated with Audio Augmented Reality", journal = j-TOMM, volume = "14", number = "3", pages = "74:1--74:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3230652", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article investigates the use of an audio augmented reality (AAR) system to recreate the soundscape of a medieval archaeological site. The aim of our work was to explore whether it is possible to enhance a tourist's archaeological experience, which is often derived from only scarce remains. We developed a smartphone-based AAR system, which uses location and orientation sensors to synthesize the soundscape of a site and plays it to the user via headphones. We recreated the ancient soundscape of a medieval archaeological site in Croatia and tested it in situ on two groups of participants using the soundwalk method. 
One test group performed the soundwalk while listening to the recreated soundscape using the AAR system, while the second control group did not use the AAR equipment. We measured the experiences of the participants using two methods: the standard soundwalk questionnaire and affective computing equipment for detecting the emotional state of participants. The results of both test methods show that participants who were listening to the ancient soundscape using our AAR system experienced higher arousal than those visiting the site without AAR.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kirchhoffer:2018:PDV, author = "Heiner Kirchhoffer and Detlev Marpe and Heiko Schwarz and Thomas Wiegand", title = "Properties and Design of Variable-to-Variable Length Codes", journal = j-TOMM, volume = "14", number = "3", pages = "75:1--75:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3230653", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For the entropy coding of independent and identically distributed (i.i.d.) binary sources, variable-to-variable length (V2V) codes are an interesting alternative to arithmetic coding. Such a V2V code translates variable length words of the source into variable length code words by employing two prefix-free codes. In this article, several properties of V2V codes are studied, and new concepts are developed. In particular, it is shown that the redundancy of a V2V code cannot be zero for a binary i.i.d. source $X$ with $0 < p_X(1) < 0.5$. 
Furthermore, the concept of prime and composite V2V codes is proposed, and it is shown why composite V2V codes can be disregarded in the search for particular classes of minimum redundancy codes. Moreover, a canonical representation for V2V codes is proposed, which identifies V2V codes that have the same average code length function. It is shown how these concepts can be employed to greatly reduce the complexity of a search for minimum redundancy (size-limited) V2V codes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kiess:2018:SCA, author = "Johannes Kiess and Stephan Kopf and Benjamin Guthier and Wolfgang Effelsberg", title = "A Survey on Content-Aware Image and Video Retargeting", journal = j-TOMM, volume = "14", number = "3", pages = "76:1--76:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3231598", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This survey introduces the current state of the art in image and video retargeting and describes important ideas and technologies that have influenced the recent work. Retargeting is the process of adapting an image or video from one screen resolution to another to fit different displays, for example, when watching a wide screen movie on a normal television screen or a mobile device. As there has been considerable work done in this field already, this survey provides an overview of the techniques. It is meant to be a starting point for new research in the field. We include explanations of basic terms and operators, as well as the basic workflow of the different methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cecil:2018:NBV, author = "J. Cecil and Avinash Gupta and M. Pirela-Cruz and Parmesh Ramanathan", title = "A Network-Based Virtual Reality Simulation Training Approach for Orthopedic Surgery", journal = j-TOMM, volume = "14", number = "3", pages = "77:1--77:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3232678", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The focus of this article is on the adoption of immersive and haptic simulators for training of medical residents in a surgical process called Less Invasive Stabilization System (LISS) plating surgery. LISS surgery is an orthopedic surgical procedure to treat fractures of the femur bone. Development of such simulators is a complex task which involves multiple systems, technologies, and human experts. Emerging Next Generation Internet technologies were used to develop the standalone on-line haptic-based simulator accessible to the students 24/7. A standalone immersive surgical simulator was also developed using HTC Vive. Expert surgeons played an important role in developing the simulator system; use cases of the target surgical processes were built using a modeling language called the engineering Enterprise Modeling Language (eEML). A detailed study presenting the comparison between the haptic-based simulator and the immersive simulator has been also presented. The outcomes of this study underscore the potential of using such simulators in surgical training.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Dong:2018:LMK, author = "Husheng Dong and Ping Lu and Chunping Liu and Yi Ji and Shengrong Gong", title = "Learning Multiple Kernel Metrics for Iterative Person Re-Identification", journal = j-TOMM, volume = "14", number = "3", pages = "78:1--78:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3234929", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In person re-identification most metric learning methods learn from training data only once, and then they are deployed for testing. Although impressive performance has been achieved, the discriminative information from successfully identified test samples are ignored. In this work, we present a novel re-identification framework termed Iterative Multiple Kernel Metric Learning (IMKML). Specifically, there are two main modules in IMKML. In the first module, multiple metrics are learned via a new derived Kernel Marginal Nullspace Learning (KMNL) algorithm. Taking advantage of learning a discriminative nullspace from neighborhood manifold, KMNL can well tackle the Small Sample Size (SSS) problem in re-identification distance metric learning. The second module is to construct a pseudo training set by performing re-identification on the testing set. The pseudo training set, which consists of the test image pairs that are highly probable correct matches, is then inserted into the labeled training set to retrain the metrics. By iteratively alternating between the two modules, many more samples will be involved for training and significant performance gains can be achieved. 
Experiments on four challenging datasets, including VIPeR, PRID450S, CUHK01, and Market-1501, show that the proposed method performs favorably against the state-of-the-art approaches, especially on the lower ranks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Abdallah:2018:ISI, author = "Maha Abdallah and Kuan-Ta Chen and Carsten Griwodz and Cheng-Hsin Hsu", title = "Introduction to the Special Issue on Delay-Sensitive Video Computing in the Cloud", journal = j-TOMM, volume = "14", number = "3s", pages = "53:1--53:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3214698", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Abdallah:2018:DSV, author = "Maha Abdallah and Carsten Griwodz and Kuan-Ta Chen and Gwendal Simon and Pin-Chun Wang and Cheng-Hsin Hsu", title = "Delay-Sensitive Video Computing in the Cloud: a Survey", journal = j-TOMM, volume = "14", number = "3s", pages = "54:1--54:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3212804", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While cloud servers provide a tremendous amount of resources for networked video applications, most successful stories of cloud-assisted video applications are presentational video services, such as YouTube and NetFlix. This article surveys the recent advances on delay-sensitive video computations in the cloud, which are crucial to cloud-assisted conversational video services, such as cloud gaming, Virtual Reality (VR), Augmented Reality (AR), and telepresence. Supporting conversational video services with cloud resources is challenging because most cloud servers are far away from the end users while these services incur the following stringent requirements: high bandwidth, short delay, and high heterogeneity. In this article, we cover the literature with a top-down approach: from applications and experience, to architecture and management, and to optimization in and outside of the cloud. We also point out major open challenges, hoping to stimulate more research activities in this emerging and exciting direction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2018:CES, author = "Yusen Li and Yunhua Deng and Xueyan Tang and Wentong Cai and Xiaoguang Liu and Gang Wang", title = "Cost-Efficient Server Provisioning for Cloud Gaming", journal = j-TOMM, volume = "14", number = "3s", pages = "55:1--55:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190838", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud gaming has gained significant popularity recently due to many important benefits such as removal of device constraints, instant-on, and cross-platform. The properties of intensive resource demands and dynamic workloads make cloud gaming appropriate to be supported by an elastic cloud platform. Facing a large user population, a fundamental problem is how to provide satisfactory cloud gaming service at modest cost. We observe that the software storage cost could be substantial compared to the server running cost in cloud gaming using elastic cloud resources. Therefore, in this article, we address the server provisioning problem for cloud gaming to optimize both the server running cost and the software storage cost. We find that the distribution of game software among servers and the selection of server types both trigger tradeoffs between the software storage cost and the server running cost in cloud gaming. We formulate the problem with a stochastic model and employ queueing theory to conduct a solid theoretical analysis of the system behaviors under different request dispatching policies. We then propose several classes of algorithms to approximate the optimal solution. The proposed algorithms are evaluated by extensive experiments using real-world parameters. 
The results show that the proposed Ordered and Genetic algorithms are computationally efficient, nearly cost-optimal, and highly robust to dynamic changes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Slivar:2018:GCD, author = "Ivan Slivar and Mirko Suznjevic and Lea Skorin-Kapov", title = "Game Categorization for Deriving {QoE}-Driven Video Encoding Configuration Strategies for Cloud Gaming", journal = j-TOMM, volume = "14", number = "3s", pages = "56:1--56:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3132041", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud gaming has been recognized as a promising shift in the online game industry, with the aim of implementing the ``on demand'' service concept that has achieved market success in other areas of digital entertainment such as movies and TV shows. The concepts of cloud computing are leveraged to render the game scene as a video stream that is then delivered to players in real-time. The main advantage of this approach is the capability of delivering high-quality graphics games to any type of end user device; however, at the cost of high bandwidth consumption and strict latency requirements. A key challenge faced by cloud game providers lies in configuring the video encoding parameters so as to maximize player Quality of Experience (QoE) while meeting bandwidth availability constraints. In this article, we tackle one aspect of this problem by addressing the following research question: Is it possible to improve service adaptation based on information about the characteristics of the game being streamed? 
To answer this question, two main challenges need to be addressed: the need for different QoE-driven video encoding (re-)configuration strategies for different categories of games, and how to determine a relevant game categorization to be used for assigning appropriate configuration strategies. We investigate these problems by conducting two subjective laboratory studies with a total of 80 players and three different games. Results indicate that different strategies should likely be applied for different types of games, and show that existing game classifications are not necessarily suitable for differentiating game types in this context. We thus further analyze objective video metrics of collected game play video traces as well as player actions per minute and use this as input data for clustering of games into two clusters. Subjective results verify that different video encoding configuration strategies may be applied to games belonging to different clusters.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Claypool:2018:GID, author = "Mark Claypool", title = "Game Input with Delay-Moving Target Selection with a Game Controller Thumbstick", journal = j-TOMM, volume = "14", number = "3s", pages = "57:1--57:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3187288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hosting interactive video-based services, such as computer games, in the Cloud poses particular challenges given user sensitivity to delay. 
A better understanding of the impact of delay on player-game interactions can help design cloud systems and games that accommodate delays inherent in cloud systems. Previous top-down studies of delay using full-featured games have helped understand the impact of delay, but often do not generalize or lend themselves to analytic modeling. Bottom-up studies isolating user input and delay can better generalize and be used in models, but have yet to be applied to cloud-hosted computer games. In order to better understand delay impact in cloud-hosted computer games, we conduct a large bottom-up user study centered on a fundamental game interaction---selecting a moving target with user input impeded by delay. Our work builds a custom game that controls both the target speed and input delay and has players select the target using a game controller analog thumbstick. Analysis of data from over 50 users shows target selection time exponentially increases with delay and target speed and is well-fit by an exponential model that includes a delay and target speed interaction term. A comparison with two previous studies, both using a mouse instead of a thumbstick, suggests the model's relationship between selection time, delay, and target speed holds more broadly, providing a foundation for a potential law explaining moving target selection with delay encountered in cloud-hosted games.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hou:2018:NHC, author = "Xueshi Hou and Yao Lu and Sujit Dey", title = "Novel Hybrid-Cast Approach to Reduce Bandwidth and Latency for Cloud-Based Virtual Space", journal = j-TOMM, volume = "14", number = "3s", pages = "58:1--58:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3205864", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we explore the possibility of enabling cloud-based virtual space applications for better computational scalability and easy access from any end device, including future lightweight wireless head-mounted displays. In particular, we investigate virtual space applications such as virtual classroom and virtual gallery, in which the scenes and activities are rendered in the cloud, with multiple views captured and streamed to each end device. A key challenge is the high bandwidth requirement to stream all the user views, leading to high operational cost and potential large delay in a bandwidth-restricted wireless network. We propose a novel hybrid-cast approach to save bandwidth in a multi-user streaming scenario. We identify and broadcast the common pixels shared by multiple users, while unicasting the residual pixels for each user. We formulate the problem of minimizing the total bitrate needed to transmit the user views using hybrid-casting and describe our approach. A common view extraction approach and a smart grouping algorithm are proposed and developed to achieve our hybrid-cast approach. 
Simulation results show that the hybrid-cast approach can significantly reduce total bitrate by up to 55\% and avoid congestion-related latency, compared to traditional cloud-based approach of transmitting all the views as individual unicast streams, hence addressing the bandwidth challenges of the cloud, with additional benefits in cost and delay.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2018:CBC, author = "Chang Liu and Wei Tsang Ooi and Jinyuan Jia and Lei Zhao", title = "{Cloud Baking}: Collaborative Scene Illumination for Dynamic {Web$3$D} Scenes", journal = j-TOMM, volume = "14", number = "3s", pages = "59:1--59:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3206431", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We propose Cloud Baking, a collaborative rendering architecture for dynamic Web3D scenes. In our architecture, the cloud renderer renders the scene with the global illumination (GI) information in a GI map; the web-based client renderer renders the scene with ambient lighting only and blends it with the GI map received from the cloud for the final scene. This approach allows the users to interact with the web scene and change the scene dynamically through the web interface end, yet move the computationally heavy tasks of global illumination computation to the cloud. A challenge we face is the interaction delay that causes the frames rendered on the cloud and the client to go out of sync. We propose to use 3D warping and a hole-filling algorithm designed for GI map to predict the late GI map. 
We show both quantitatively and visually the quality of the GI map produced using our method. Our prediction algorithm allows us to further reduce the frequency at which the GI map is computed and sent from the server, reducing both computational needs and bandwidth usage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cesar:2018:BPA, author = "Pablo Cesar and Cheng-Hsin Hsu and Chun-Ying Huang and Pan Hui", title = "Best Papers of the {ACM Multimedia Systems (MMSys) Conference 2017} and the {ACM Workshop on Network and Operating System Support for Digital Audio and Video (NOSSDAV) 2017}", journal = j-TOMM, volume = "14", number = "3s", pages = "60:1--60:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3214700", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zahran:2018:AAS, author = "Ahmed H. Zahran and Jason J. Quinlan and K. K. Ramakrishnan and Cormac J. 
Sreenan", title = "{ASAP}: Adaptive Stall-Aware Pacing for Improved {DASH} Video Experience in Cellular Networks", journal = j-TOMM, volume = "14", number = "3s", pages = "61:1--61:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3219750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The dramatic growth of video traffic represents a practical challenge for cellular network operators in providing a consistent streaming Quality of Experience (QoE) to their users. Satisfying this objective has so-far proved elusive, due to the inherent characteristics of wireless networks and varying channel conditions as well as variability in the video bitrate that can degrade streaming performance. In this article, we propose stall-aware pacing as a novel MPEG DASH video traffic management solution that reduces playback stalls and seeks to maintain a consistent QoE for cellular users, even those with diverse channel conditions. These goals are achieved by leveraging both network and client state information to optimize the pacing of individual video flows. We evaluate the performance of two versions of stall-aware pacing techniques extensively, including stall-aware pacing (SAP) and adaptive stall-aware pacing (ASAP), using real video content and clients, operating over a simulated LTE network. We implement state-of-the-art client adaptation and traffic management strategies for direct comparisons with SAP and ASAP. Our results, using a heavily loaded base station, show that SAP reduces the number of stalls and the average stall duration per session by up to 95\%. Additionally, SAP ensures that clients with good channel conditions do not dominate available wireless resources, evidenced by a reduction of up to 40\% in the standard deviation of the QoE metric across clients. 
We also show that ASAP achieves additional performance gains by adaptively pacing video streams based on the application buffer state.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhou:2018:EOP, author = "Chao Zhou and Zhenhua Li and Joe Osgood and Yao Liu", title = "On the Effectiveness of Offset Projections for $ 360$-Degree Video Streaming", journal = j-TOMM, volume = "14", number = "3s", pages = "62:1--62:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209660", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "A new generation of video streaming technology, 360-degree video, promises greater immersiveness than standard video streams. This level of immersiveness is similar to that produced by virtual reality devices---users can control the field of view using head movements rather than needing to manipulate external devices. Although 360-degree video could revolutionize the streaming experience, its large-scale adoption is hindered by a number of factors: 360-degree video streams have larger bandwidth requirements and require faster responsiveness to user inputs, and users may be more sensitive to lower quality streams. In this article, we review standard approaches toward 360-degree video encoding and compare these to families of approaches that distort the spherical surface to allow oriented concentrations of the 360-degree view. We refer to these distorted projections as offset projections. 
Our measurement studies show that most types of offset projections produce rendered views with better quality than their nonoffset equivalents when view orientations are within 40 or 50 degrees of the offset orientation. Offset projections complicate adaptive 360-degree video streaming because they require a combination of bitrate and view orientation adaptations. We estimate that this combination of streaming adaptation in two dimensions can cause over 57\% extra segments to be downloaded compared to an ideal downloading strategy, wasting 20\% of the total downloading bandwidth.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bahirat:2018:DEM, author = "Kanchan Bahirat and Chengyuan Lai and Ryan P. Mcmahan and Balakrishnan Prabhakaran", title = "Designing and Evaluating a Mesh Simplification Algorithm for Virtual Reality", journal = j-TOMM, volume = "14", number = "3s", pages = "63:1--63:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209661", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the increasing accessibility of the mobile head-mounted displays (HMDs), mobile virtual reality (VR) systems are finding applications in various areas. However, mobile HMDs are highly constrained with limited graphics processing units (GPUs) and low processing power and onboard memory. Hence, VR developers must be cognizant of the number of polygons contained within their virtual environments to avoid rendering at low frame rates and inducing simulator sickness. 
The most robust and rapid approach to keeping the overall number of polygons low is to use mesh simplification algorithms to create low-poly versions of pre-existing, high-poly models. Unfortunately, most existing mesh simplification algorithms cannot adequately handle meshes with lots of boundaries or nonmanifold meshes, which are common attributes of many 3D models. In this article, we present QEM$_{4VR}$, a high-fidelity mesh simplification algorithm specifically designed for VR. This algorithm addresses the deficiencies of prior quadric error metric (QEM) approaches by leveraging the insight that the most relevant boundary edges lie along curvatures while linear boundary edges can be collapsed. Additionally, our algorithm preserves key surface properties, such as normals, texture coordinates, colors, and materials, as it preprocesses 3D models and generates their low-poly approximations offline. We evaluated the effectiveness of our QEM$_{4VR}$ algorithm by comparing its simplified-mesh results to those of prior QEM variations in terms of geometric approximation error, texture error, progressive approximation errors, frame rate impact, and perceptual quality measures. We found that QEM$_{4VR}$ consistently yielded simplified meshes with less geometric approximation error and texture error than the prior QEM variations. It afforded better frame rates than QEM variations with boundary preservation constraints that create unnecessary lower bounds on overall polygon count reduction. Our evaluation revealed that QEM$_{4VR}$ did not fare well in terms of existing perceptual distance measurements, but human-based inspections demonstrate that these algorithmic measurements are not suitable substitutes for actual human perception. In turn, we present a user-based methodology for evaluating the perceptual qualities of mesh simplification algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2018:ELV, author = "Junjue Wang and Brandon Amos and Anupam Das and Padmanabhan Pillai and Norman Sadeh and Mahadev Satyanarayanan", title = "Enabling Live Video Analytics with a Scalable and Privacy-Aware Framework", journal = j-TOMM, volume = "14", number = "3s", pages = "64:1--64:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209659", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We show how to build the components of a privacy-aware, live video analytics ecosystem from the bottom up, starting with OpenFace, our new open-source face recognition system that approaches state-of-the-art accuracy. Integrating OpenFace with interframe tracking, we build RTFace, a mechanism for denaturing video streams that selectively blurs faces according to specified policies at full frame rates. This enables privacy management for live video analytics while providing a secure approach for handling retrospective policy exceptions. Finally, we present a scalable, privacy-aware architecture for large camera networks using RTFace and show how it can be an enabler for a vibrant ecosystem and marketplace of privacy-aware video streams and analytics services.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gudmundsson:2018:PWS, author = "Gylfi {\Thorn}{\'o}r Gudmundsson and Bj{\"o}rn {\Thorn}{\'o}r J{\'o}nsson and Laurent Amsaleg and Michael J. 
Franklin", title = "Prototyping a {Web}-Scale Multimedia Retrieval Service Using {Spark}", journal = j-TOMM, volume = "14", number = "3s", pages = "65:1--65:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209662", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The world has experienced phenomenal growth in data production and storage in recent years, much of which has taken the form of media files. At the same time, computing power has become abundant with multi-core machines, grids, and clouds. Yet it remains a challenge to harness the available power and move toward gracefully searching and retrieving from web-scale media collections. Several researchers have experimented with using automatically distributed computing frameworks, notably Hadoop and Spark, for processing multimedia material, but mostly using small collections on small computing clusters. In this article, we describe a prototype of a (near) web-scale throughput-oriented MM retrieval service using the Spark framework running on the AWS cloud service. We present retrieval results using up to 43 billion SIFT feature vectors from the public YFCC 100M collection, making this the largest high-dimensional feature vector collection reported in the literature. We also present a publicly available demonstration retrieval system, running on our own servers, where the implementation of the Spark pipelines can be observed in practice using standard image benchmarks, and downloaded for research purposes. Finally, we describe a method to evaluate retrieval quality of the ever-growing high-dimensional index of the prototype, without actually indexing a web-scale media collection.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ma:2018:CUB, author = "Ming Ma and Lei Zhang and Jiangchuan Liu and Zhi Wang and Haitian Pang and Lifeng Sun and Weihua Li and Guangling Hou and Kaiyan Chu", title = "Characterizing User Behaviors in Mobile Personal Livecast: Towards an Edge Computing-assisted Paradigm", journal = j-TOMM, volume = "14", number = "3s", pages = "66:1--66:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3219751", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mobile personal livecast (MPL) services are emerging and have received great attention recently. In MPL, numerous and geo-distributed ordinary people broadcast their video contents to worldwide viewers. Different from conventional social networking services like Twitter and Facebook, which have a tolerance for interaction delay, the interactions (e.g., chat messages) in a personal livecast must be in real-time with low feedback latency. These unique characteristics inspire us to: (1) investigate how the relationships (e.g., social links and geo-locations) between viewers and broadcasters influence the user behaviors, which has yet to be explored in depth; and (2) explore insights to benefit the improvement of system performance. In this article, we carry out extensive measurements of a representative MPL system, with a large-scale dataset containing 11M users. 
In the current costly and limited cloud-based MPL system, which is faced with scalability problem, we find: (1) the long content uploading distances between broadcasters and cloud ingesting servers result in an impaired system QoS, including a high broadcast latency and a frequently buffering events; and (2) most of the broadcasters in MPL are geographically locally popular (the majority of the views come from the same region of the broadcaster), which consume vast computation and bandwidth resources of the clouds and Content Delivery Networks. Fortunately, the emergence of edge computing, which provides cloud-computing capabilities at the edge of the mobile network, naturally sheds new light on the MPL system; i.e., localized ingesting, transcoding, and delivering locally popular live content is possible. Based on these critical observations, we propose an edge-assisted MPL system that collaboratively utilizes the core-cloud and abundant edge computing resources to improve the system efficiency and scalability. In our framework, we consider a dynamic broadcaster assignment to minimize the broadcast latency while keeping the resource lease cost low. We formulate the broadcaster scheduling as a stable matching with migration problem to solve it effectively. Compared with the current pure cloud-based system, our edge-assisted delivery approach reduces the broadcast latency by about 35\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Huang:2018:UBA, author = "Lei Huang and Bowen Ding and Aining Wang and Yuedong Xu and Yipeng Zhou and Xiang Li", title = "User Behavior Analysis and Video Popularity Prediction on a Large-Scale {VoD} System", journal = j-TOMM, volume = "14", number = "3s", pages = "67:1--67:??", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226035", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Understanding streaming user behavior is crucial to the design of large-scale Video-on-Demand (VoD) systems. In this article, we begin with the measurement of individual viewing behavior from two aspects: the temporal characteristics and user interest. We observe that active users spend more hours on each active day, and their daily request time distribution is more scattered than that of the less active users, while the inter-view time distribution differs negligibly between two groups. The common interest in popular videos and the latest uploaded videos is observed in both groups. We then investigate the predictability of video popularity as a collective user behavior through early views. In the light of the limitations of classical approaches, the Autoregressive-Moving-Average (ARMA) model is employed to forecast the popularity dynamics of individual videos at fine-grained time scales, thus achieving much higher prediction accuracy. 
When applied to video caching, the ARMA-assisted Least Frequently Used (LFU) algorithm can outperform the Least Recently Used (LRU) by 11--16\%, the well-tuned LFU by 6--13\%, and the LFU is only 2--4\% inferior to the offline LFU in terms of hit ratio.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2018:JHA, author = "Junfeng Zhang and Haifeng Hu", title = "Joint Head Attribute Classifier and Domain-Specific Refinement Networks for Face Alignment", journal = j-TOMM, volume = "14", number = "4", pages = "79:1--79:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241059", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, a two-stage refinement network is proposed for facial landmarks detection on unconstrained conditions. Our model can be divided into two modules, namely the Head Attribute Classifier (HAC) module and the Domain-Specific Refinement (DSR) module. Given an input facial image, HAC adopts multi-task learning mechanism to detect the head pose and obtain an initial shape. Based on the obtained head pose, DSR designs three different CNN-based refinement networks trained by specific domain, respectively, and automatically selects the most approximate network for the landmarks refinement. Different from existing two-stage models, HAC combines head pose prediction with facial landmarks estimation to improve the accuracy of head pose prediction, as well as obtaining a robust initial shape. 
Moreover, an adaptive sub-network training strategy applied in the DSR module can effectively solve the issue of traditional multi-view methods that an improperly selected sub-network may result in alignment failure. The extensive experimental results on two public datasets, AFLW and 300W, confirm the validity of our model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{PascottiValem:2018:USL, author = "Lucas {Pascotti Valem} and Carlos {Renan De Oliveira} and Daniel Carlos {Guimar{\~a}es Pedronette} and Jurandy Almeida", title = "Unsupervised Similarity Learning through Rank Correlation and {kNN} Sets", journal = j-TOMM, volume = "14", number = "4", pages = "80:1--80:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241053", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The increasing amount of multimedia data collections available today evinces the pressing need for methods capable of indexing and retrieving this content. Despite the continuous advances in multimedia features and representation models, to establish an effective measure for comparing different multimedia objects still remains a challenging task. While supervised and semi-supervised techniques made relevant advances on similarity learning tasks, scenarios where labeled data are non-existent require different strategies. In such situations, unsupervised learning has been established as a promising solution, capable of considering the contextual information and the dataset structure for computing new similarity/dissimilarity measures. 
This article extends a recent unsupervised learning algorithm that uses an iterative re-ranking strategy to take advantage of different k-Nearest Neighbors (kNN) sets and rank correlation measures. Two novel approaches are proposed for computing the kNN sets and their corresponding top-k lists. The proposed approaches were validated in conjunction with various rank correlation measures, yielding superior effectiveness results in comparison with previous works. In addition, we also evaluate the ability of the method in considering different multimedia objects, conducting an extensive experimental evaluation on various image and video datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2018:TLD, author = "Hui-Yin Wu and Francesca Pal{\`u} and Roberto Ranon and Marc Christie", title = "Thinking Like a Director: Film Editing Patterns for Virtual Cinematographic Storytelling", journal = j-TOMM, volume = "14", number = "4", pages = "81:1--81:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241057", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article introduces Film Editing Patterns (FEP), a language to formalize film editing practices and stylistic choices found in movies. FEP constructs are constraints, expressed over one or more shots from a movie sequence, that characterize changes in cinematographic visual properties, such as shot sizes, camera angles, or layout of actors on the screen. 
We present the vocabulary of the FEP language, introduce its usage in analyzing styles from annotated film data, and describe how it can support users in the creative design of film sequences in 3D. More specifically, (i) we define the FEP language, (ii) we present an application to craft filmic sequences from 3D animated scenes that uses FEPs as a high level mean to select cameras and perform cuts between cameras that follow best practices in cinema, and (iii) we evaluate the benefits of FEPs by performing user experiments in which professional filmmakers and amateurs had to create cinematographic sequences. The evaluation suggests that users generally appreciate the idea of FEPs, and that it can effectively help novice and medium experienced users in crafting film sequences with little training.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yu:2018:SPI, author = "Tuo Yu and Haiming Jin and Wai-Tian Tan and Klara Nahrstedt", title = "{SKEPRID}: Pose and Illumination Change-Resistant Skeleton-Based Person Re-Identification", journal = j-TOMM, volume = "14", number = "4", pages = "82:1--82:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3243217", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Currently, the surveillance camera-based person re-identification is still challenging because of diverse factors such as people's changing poses and various illumination. The various poses make it hard to conduct feature matching across images, and the illumination changes make color-based features unreliable. 
In this article, we present SKEPRID,$^1$ a skeleton-based person re-identification method that handles strong pose and illumination changes jointly. To reduce the impacts of pose changes on re-identification, we estimate the joints' positions of a person based on the deep learning technique and thus make it possible to extract features on specific body parts with high accuracy. Based on the skeleton information, we design a set of local color comparison-based cloth-type features, which are resistant to various lighting conditions. Moreover, to better evaluate SKEPRID, we build the PO8LI$^2$ dataset, which has large pose and illumination diversity. Our experimental results show that SKEPRID outperforms state-of-the-art approaches in the case of strong pose and illumination variation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Fan:2018:UPR, author = "Hehe Fan and Liang Zheng and Chenggang Yan and Yi Yang", title = "Unsupervised Person Re-identification: Clustering and Fine-tuning", journal = j-TOMM, volume = "14", number = "4", pages = "83:1--83:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3243316", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The superiority of deeply learned pedestrian representations has been reported in very recent literature of person re-identification (re-ID). In this article, we consider the more pragmatic issue of learning a deep feature with no or only a few labels. We propose a progressive unsupervised learning (PUL) method to transfer pretrained deep representations to unseen domains. 
Our method is easy to implement and can be viewed as an effective baseline for unsupervised re-ID feature learning. Specifically, PUL iterates between (1) pedestrian clustering and (2) fine-tuning of the convolutional neural network (CNN) to improve the initialization model trained on the irrelevant labeled dataset. Since the clustering results can be very noisy, we add a selection operation between the clustering and fine-tuning. At the beginning, when the model is weak, CNN is fine-tuned on a small amount of reliable examples that locate near to cluster centroids in the feature space. As the model becomes stronger, in subsequent iterations, more images are being adaptively selected as CNN training samples. Progressively, pedestrian clustering and the CNN model are improved simultaneously until algorithm convergence. This process is naturally formulated as self-paced learning. We then point out promising directions that may lead to further improvement. Extensive experiments on three large-scale re-ID datasets demonstrate that PUL outputs discriminative features that improve the re-ID accuracy. Our code has been released at https://github.com/hehefan/Unsupervised-Person-Re-identification-Clustering-and-Fine-tuning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lin:2018:REN, author = "Xiaodan Lin and Xiangui Kang", title = "Robust Electric Network Frequency Estimation with Rank Reduction and Linear Prediction", journal = j-TOMM, volume = "14", number = "4", pages = "84:1--84:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241058", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article deals with the problem of Electric Network Frequency (ENF) estimation where Signal to Noise Ratio (SNR) is an essential challenge. By exploiting the low-rank structure of the ENF signal from the audio spectrogram, we propose an approach based on robust principal component analysis to get rid of the interference from speech contents and some of the background noise, which in our case can be regarded as sparse in nature. Weighted linear prediction is enforced on the low-rank signal subspace to gain accurate ENF estimation. The performance of the proposed scheme is analyzed and evaluated as a function of SNR, and the Cram{\'e}r-Rao Lower Bound (CRLB) is approached at an SNR level above -10 dB. Experiments on real datasets have demonstrated the advantages of the proposed method over state-of-the-art work in terms of estimation accuracy. Specifically, the proposed scheme can effectively capture the ENF fluctuations along the time axis using small numbers of signal observations while preserving sufficient frequency precision.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2018:PMB, author = "Yue Li and Gaobo Yang and Yapei Zhu and Xiangling Ding and Rongrong Gong", title = "Probability Model-Based Early Merge Mode Decision for Dependent Views Coding in {$3$D-HEVC}", journal = j-TOMM, volume = "14", number = "4", pages = "85:1--85:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3267128", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As a 3D extension to the High Efficiency Video Coding (HEVC) standard, 3D-HEVC was developed to improve the coding efficiency of multiview videos. It inherits the prediction modes from HEVC, yet both Motion Estimation (ME) and Disparity Estimation (DE) are required for dependent views coding. This improves coding efficiency at the cost of huge computational costs. In this article, an early Merge mode decision approach is proposed for dependent texture views and dependent depth maps coding in 3D-HEVC based on priori and posterior probability models. First, the priori probability model is established by exploiting the hierarchical and interview correlations from those previously encoded blocks. Second, the posterior probability model is built by using the Coded Block Flag (CBF) of the current coding block. Finally, the joint priori and posterior probability model is adopted to early terminate the Merge mode decision for both dependent texture views and dependent depth maps coding. 
Experimental results show that the proposed approach saves 45.2\% and 30.6\% encoding time on average for dependent texture views and dependent depth maps coding while maintaining negligible loss of coding efficiency, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Santos:2018:HAS, author = "Joel A. F. {Dos Santos} and D{\'e}bora C. Muchaluat-Saade and C{\'e}cile Roisin and Nabil Laya{\"\i}da", title = "A Hybrid Approach for Spatio-Temporal Validation of Declarative Multimedia Documents", journal = j-TOMM, volume = "14", number = "4", pages = "86:1--86:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3267127", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Declarative multimedia documents represent the description of multimedia applications in terms of media items and relationships among them. Relationships specify how media items are dynamically arranged in time and space during runtime. Although a declarative approach usually facilitates the authoring task, authors can still make mistakes due to incorrect use of language constructs or inconsistent or missing relationships in a document. In order to properly support multimedia application authoring, it is important to provide tools with validation capabilities. Document validation can indicate possible inconsistencies in a given document to an author so that it can be revised before deployment. Although very useful, multimedia validation tools are not often provided by authoring tools. This work proposes a multimedia validation approach that relies on a formal model called Simple Hypermedia Model (SHM). 
SHM is used for representing a document for the purpose of validation. An SHM document is validated using a hybrid approach based on two complementary techniques. The first one captures the document's spatio-temporal layout in terms of its state throughout its execution by means of a rewrite theory, and validation is performed through model-checking. The second one captures the document's layout in terms of intervals and event occurrences by means of Satisfiability Modulo Theories (SMT) formulas, and validation is performed through SMT solving. Due to different characteristics of both approaches, each validation technique complements the other in terms of expressiveness of SHM and tests to be checked. We briefly present validation tools that use our approach. They were evaluated with real NCL documents and by usability tests.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2018:ICS, author = "Jie Wu and Haifeng Hu and Yi Wu", title = "Image Captioning via Semantic Guidance Attention and Consensus Selection Strategy", journal = j-TOMM, volume = "14", number = "4", pages = "87:1--87:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3271485", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, a series of attempts have incorporated spatial attention mechanisms into the task of image captioning, which achieves a remarkable improvement in the quality of generative captions. 
However, the traditional spatial attention mechanism adopts latent and delayed semantic representations to decide which area should be paid more attention to, resulting in inaccurate semantic guidance and the introduction of redundant information. In order to optimize the spatial attention mechanism, we propose the Semantic Guidance Attention (SGA) mechanism in this article. Specifically, SGA utilizes semantic word representations to provide an intuitive semantic guidance that focuses accurately on semantic-related regions. Moreover, we reduce the difficulty of generating fluent sentences by updating the attention information in time. At the same time, the beam search algorithm is widely used to predict words during sequence generation. This algorithm generates a sentence according to the probabilities of words, so it is easy to push out a generic sentence and discard some distinctive captions. In order to overcome this limitation, we design the Consensus Selection (CS) strategy to choose the most descriptive and informative caption, which is selected by the semantic similarity of captions instead of the probabilities of words. The consensus caption is determined by selecting the one with the highest cumulative semantic similarity with respect to the reference captions. Our proposed model (SGA-CS) is validated on Flickr30k and MSCOCO, which shows that SGA-CS outperforms state-of-the-art approaches. To our best knowledge, SGA-CS is the first attempt to jointly produce semantic attention guidance and select descriptive captions for image captioning tasks, achieving one of the best performance ratings among any cross-entropy training methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Strezoski:2018:OLS, author = "Gjorgji Strezoski and Marcel Worring", title = "{OmniArt}: a Large-scale Artistic Benchmark", journal = j-TOMM, volume = "14", number = "4", pages = "88:1--88:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3273022", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Baselines are the starting point of any quantitative multimedia research, and benchmarks are essential for pushing those baselines further. In this article, we present baselines for the artistic domain with a new benchmark dataset featuring over 2 million images with rich structured metadata dubbed OmniArt. OmniArt contains annotations for dozens of attribute types and features semantic context information through concepts, IconClass labels, color information, and (limited) object-level bounding boxes. For our dataset we establish and present baseline scores on multiple tasks such as artist attribution, creation period estimation, type, style, and school prediction. In addition to our metadata related experiments, we explore the color spaces of art through different types and evaluate a transfer learning object recognition pipeline.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Koch:2018:CYU, author = "Christian Koch and Moritz Lode and Denny Stohr and Amr Rizk and Ralf Steinmetz", title = "Collaborations on {YouTube}: From Unsupervised Detection to the Impact on Video and Channel Popularity", journal = j-TOMM, volume = "14", number = "4", pages = "89:1--89:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241054", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:45 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "YouTube is the most popular platform for streaming of user-generated videos. Nowadays, professional YouTubers are organized in so-called multichannel networks (MCNs). These networks offer services such as brand deals, equipment, and strategic advice in exchange for a share of the YouTubers' revenues. A dominant strategy to gain more subscribers and, hence, revenue is collaborating with other YouTubers. Yet, collaborations on YouTube have not been studied in a detailed quantitative manner. To close this gap, first, we collect a YouTube dataset covering video statistics over 3 months for 7,942 channels. Second, we design a framework for collaboration detection given a previously unknown number of persons featured in YouTube videos. We denote this framework, for the detection and analysis of collaborations in YouTube videos using a Deep Neural Network (DNN)-based approach, as CATANA. Third, we analyze about 2.4 years of video content and use CATANA to answer research questions guiding YouTubers and MCNs for efficient collaboration strategies. 
Thereby, we focus on (1) collaboration frequency and partner selectivity, (2) the influence of MCNs on channel collaborations, (3) collaborating channel types, and (4) the impact of collaborations on video and channel popularity. Our results show that collaborations are in many cases significantly beneficial regarding viewers and newly attracted subscribers for both collaborating channels, often showing more than 100\% popularity growth compared with noncollaboration videos.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2019:EQA, author = "Wei Zhang", title = "Efficient {QoE}-Aware Scheme for Video Quality Switching Operations in Dynamic Adaptive Streaming", journal = j-TOMM, volume = "15", number = "1", pages = "17:1--17:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3269494", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3269494", abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is a popular over-the-top video content distribution technique that adapts the streaming session according to the user's network condition typically in terms of downlink bandwidth. This video quality adaptation can be achieved by scaling the frame quality, spatial resolution or frame rate. Despite the flexibility on the video quality scaling methods, each of these quality scaling dimensions has varying effects on the Quality of Experience (QoE) for end users. 
Furthermore, in video streaming, the changes in motion over time along with the scaling method employed have an influence on QoE, hence the need to carefully tailor scaling methods to suit streaming applications and content type. In this work, we investigate an intelligent DASH approach for the latest video coding standard H.265 and propose a heuristic QoE-aware cost-efficient adaptation scheme that does not switch unnecessarily to the highest quality level but rather stays temporarily at an intermediate quality level in certain streaming scenarios. Such an approach achieves a comparable and consistent level of quality under impaired network conditions as commonly found in Internet and mobile networks while reducing bandwidth requirements and quality switching overhead. The rationale is based on our empirical experiments, which show that an increase in bitrate does not necessarily mean noticeable improvement in QoE. Furthermore, our work demonstrates that the Signal-to-Noise Ratio (SNR) and the spatial resolution scalability types are the best fit for our proposed algorithm. Finally, we demonstrate an innovative interaction between quality scaling methods and the polarity of switching operations. The proposed QoE-aware scheme is implemented and empirical results show that it is able to reduce bandwidth requirements by up to 41\% whilst achieving equivalent QoE compared with a representative DASH reference implementation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yahia:2019:HBF, author = "Mariem {Ben Yahia} and Yannick {Le Louedec} and Gwendal Simon and Loutfi Nuaymi and Xavier Corbillon", title = "{HTTP/2}-based Frame Discarding for Low-Latency Adaptive Video Streaming", journal = j-TOMM, volume = "15", number = "1", pages = "18:1--18:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280854", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3280854", abstract = "In this article, we propose video delivery schemes insuring around 1s delivery latency with Dynamic Adaptive Streaming over HTTP (DASH), which is a standard version of HTTP Live Streaming (HLS), so as to benefit from the video representation switching between successive video segments. We also propose HTTP/2-based algorithms to apply video frame discarding policies inside a video segment when a selected DASH representation does not match with the available network resources. The current solutions with small buffer suffer from rebuffering events. Rebuffering not only impacts the Quality of Experience (QoE) but also increases the delivery delay between the displayed and the original video streams. In this work, we completely eliminate rebuffering events by developing optimal and practical video frame discarding algorithms to meet the 1s latency constraint. In all our algorithms, we request the video frames individually through HTTP/2 multiple streams, and we selectively drop the least meaningful video frames thanks to HTTP/2 stream resetting feature. 
Our simulations show that the proposed algorithms eliminate rebuffering while insuring an acceptable video quality with at least a Peak Signal to Noise Ratio (PSNR) of 35dB compared to 25dB of the basic First In First Out (FIFO) algorithm. We also quantify and qualify the resulting temporal distortion of the video segments per algorithm. An important number of missing video frames results in a temporal fluidity break known as video jitter. The displayed video looks like a series of snapshots. We show that both the optimal Integer Linear Program (ILP) and practical algorithms decrease the frequency and duration of the jitters. For example, practical algorithms reduce the number of crashed displayed videos (presenting one jitter longer than 1,350ms) with 22\% compared to the basic FIFO algorithm. We also show that requesting video frames separately with HTTP/2 slightly increases the overhead from 4.34\% to 5.76\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2019:SRC, author = "Xianguo Li and Yemei Sun and Yanli Yang and Changyun Miao", title = "Symmetrical Residual Connections for Single Image Super-Resolution", journal = j-TOMM, volume = "15", number = "1", pages = "19:1--19:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3282445", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3282445", abstract = "Single-image super-resolution (SISR) methods based on convolutional neural networks (CNN) have shown great potential in the literature. 
However, most deep CNN models don't have direct access to subsequent layers, seriously hindering the information flow. Furthermore, they fail to make full use of the hierarchical features from different low-level layers, thereby resulting in relatively low accuracy. In this article, we present a new SISR CNN, called SymSR, which incorporates symmetrical nested residual connections to improve both the accuracy and the execution speed. SymSR takes a larger image region for contextual spreading. It symmetrically combines multiple short paths for the forward propagation to improve the accuracy and for the backward propagation of gradient flow to accelerate the convergence speed. Extensive experiments based on open challenge datasets show the effectiveness of symmetrical residual connections. Compared with four other state-of-the-art super-resolution CNN methods, SymSR is superior in both accuracy and runtime.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yu:2019:DCM, author = "Yi Yu and Suhua Tang and Francisco Raposo and Lei Chen", title = "Deep Cross-Modal Correlation Learning for Audio and Lyrics in Music Retrieval", journal = j-TOMM, volume = "15", number = "1", pages = "20:1--20:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3281746", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3281746", abstract = "Deep cross-modal learning has successfully demonstrated excellent performance in cross-modal multimedia retrieval, with the aim of learning joint representations between different data modalities. 
Unfortunately, little research focuses on cross-modal correlation learning where temporal structures of different data modalities, such as audio and lyrics, should be taken into account. Stemming from the characteristic of temporal structures of music in nature, we are motivated to learn the deep sequential correlation between audio and lyrics. In this work, we propose a deep cross-modal correlation learning architecture involving two-branch deep neural networks for audio modality and text modality (lyrics). Data in different modalities are converted to the same canonical space where intermodal canonical correlation analysis is utilized as an objective function to calculate the similarity of temporal structures. This is the first study that uses deep architectures for learning the temporal correlation between audio and lyrics. A pretrained Doc2Vec model followed by fully connected layers is used to represent lyrics. Two significant contributions are made in the audio branch, as follows: (i) We propose an end-to-end network to learn cross-modal correlation between audio and lyrics, where feature extraction and correlation learning are simultaneously performed and joint representation is learned by considering temporal structures. (ii) And, as for feature extraction, we further represent an audio signal by a short sequence of local summaries (VGG16 features) and apply a recurrent neural network to compute a compact feature that better learns the temporal structures of music audio. Experimental results, using audio to retrieve lyrics or using lyrics to retrieve audio, verify the effectiveness of the proposed deep correlation learning architectures in cross-modal music retrieval.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Sun:2019:ERF, author = "Jia Sun and Di Huang and Yunhong Wang and Liming Chen", title = "Expression Robust {$3$D} Facial Landmarking via Progressive Coarse-to-Fine Tuning", journal = j-TOMM, volume = "15", number = "1", pages = "21:1--21:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3282833", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3282833", abstract = "Facial landmarking is a fundamental task in automatic machine-based face analysis. The majority of existing techniques for such a problem are based on 2D images; however, they suffer from illumination and pose variations that may largely degrade landmarking performance. The emergence of 3D data theoretically provides an alternative to overcome these weaknesses in the 2D domain. This article proposes a novel approach to 3D facial landmarking, which combines both the advantages of feature-based methods as well as model-based ones in a progressive three-stage coarse-to-fine manner (initial, intermediate, and fine stages). For the initial stage, a few fiducial landmarks (i.e., the nose tip and two inner eye corners) are robustly detected through curvature analysis, and these points are further exploited to initialize the subsequent stage. For the intermediate stage, a statistical model is learned in the feature space of three normal components of the facial point-cloud rather than the smooth original coordinates, namely Active Normal Model (ANM). For the fine stage, cascaded regression is employed to locally refine the landmarks according to their geometry attributes. 
The proposed approach can accurately localize dozens of fiducial points on each 3D face scan, greatly surpassing the feature-based ones, and it also improves the state of the art of the model-based ones in two aspects: sensitivity to initialization and deficiency in discrimination. The proposed method is evaluated on the BU-3DFE, Bosphorus, and BU-4DFE databases, and competitive results are achieved in comparison with counterparts in the literature, clearly demonstrating its effectiveness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Peng:2019:CGC, author = "Yuxin Peng and Jinwei Qi", title = "{CM-GANs}: Cross-modal Generative Adversarial Networks for Common Representation Learning", journal = j-TOMM, volume = "15", number = "1", pages = "22:1--22:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3284750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3284750", abstract = "It is known that the inconsistent distributions and representations of different modalities, such as image and text, cause the heterogeneity gap, which makes it very challenging to correlate heterogeneous data and measure their similarities. Recently, generative adversarial networks (GANs) have been proposed and have shown their strong ability to model data distribution and learn discriminative representation. It has also been shown that adversarial learning can be fully exploited to learn discriminative common representations for bridging the heterogeneity gap. 
Inspired by this, we aim to effectively correlate large-scale heterogeneous data of different modalities with the power of GANs to model cross-modal joint distribution. In this article, we propose Cross-modal Generative Adversarial Networks (CM-GANs) with the following contributions. First, a cross-modal GAN architecture is proposed to model joint distribution over the data of different modalities. The inter-modality and intra-modality correlation can be explored simultaneously in generative and discriminative models. Both compete with each other to promote cross-modal correlation learning. Second, the cross-modal convolutional autoencoders with weight-sharing constraint are proposed to form the generative model. They not only exploit the cross-modal correlation for learning the common representations but also preserve reconstruction information for capturing the semantic consistency within each modality. Third, a cross-modal adversarial training mechanism is proposed, which uses two kinds of discriminative models to simultaneously conduct intra-modality and inter-modality discrimination. They can mutually boost to make the generated common representations more discriminative by the adversarial training process. In summary, our proposed CM-GAN approach can use GANs to perform cross-modal common representation learning by which the heterogeneous data can be effectively correlated. Extensive experiments are conducted to verify the performance of CM-GANs on cross-modal retrieval compared with 13 state-of-the-art methods on 4 cross-modal datasets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pala:2019:RFM, author = "Pietro Pala and Stefano Berretti", title = "Reconstructing {$3$D} Face Models by Incremental Aggregation and Refinement of Depth Frames", journal = j-TOMM, volume = "15", number = "1", pages = "23:1--23:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3287309", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3287309", abstract = "Face recognition from two-dimensional (2D) still images and videos is quite successful even with ``in the wild'' conditions. Instead, less consolidated results are available for the cases in which face data come from non-conventional cameras, such as infrared or depth. In this article, we investigate this latter scenario assuming that a low-resolution depth camera is used to perform face recognition in an uncooperative context. To this end, we propose, first, to automatically select a set of frames from the depth sequence of the camera because they provide a good view of the face in terms of pose and distance. Then, we design a progressive refinement approach to reconstruct a higher-resolution model from the selected low-resolution frames. This process accounts for the anisotropic error of the existing points in the current 3D model and the points in a newly acquired frame so that the refinement step can progressively adjust the point positions in the model using a Kalman-like estimation. The quality of the reconstructed model is evaluated by considering the error between the reconstructed models and their corresponding high-resolution scans used as ground truth. 
In addition, we performed face recognition using the reconstructed models as probes against a gallery of reconstructed models and a gallery with high-resolution scans. The obtained results confirm the possibility to effectively use the reconstructed models for the face recognition task.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2019:OCT, author = "Han Hu and Yichao Jin and Yonggang Wen and Cedric Westphal", title = "Orchestrating Caching, Transcoding and Request Routing for Adaptive Video Streaming Over {ICN}", journal = j-TOMM, volume = "15", number = "1", pages = "24:1--24:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3289184", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3289184", abstract = "Information-centric networking (ICN) has been touted as a revolutionary solution for the future of the Internet, which will be dominated by video traffic. This work investigates the challenge of distributing video content of adaptive bitrate (ABR) over ICN. In particular, we use the in-network caching capability of ICN routers to serve users; in addition, with the help of named function, we enable ICN routers to transcode videos to lower-bitrate versions to improve the cache hit ratio. Mathematically, we formulate this design challenge into a constrained optimization problem, which aims to maximize the cache hit ratio for service providers and minimize the service delay for end users. We design a two-step iterative algorithm to find the optimum. 
First, given a content management scheme, we minimize the service delay via optimally configuring the routing scheme. Second, we maximize the cache hits for a given routing policy. Finally, we rigorously prove its convergence. Through extensive simulations, we verify the convergence and the performance gains over other algorithms. We also find that more resources should be allocated to ICN routers with a heavier request rate, and the routing scheme favors the shortest path to schedule more traffic.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yuan:2019:DLT, author = "Bo Yuan and Xinbo Gao and Zhenxing Niu and Qi Tian", title = "Discovering Latent Topics by {Gaussian} Latent {Dirichlet} Allocation and Spectral Clustering", journal = j-TOMM, volume = "15", number = "1", pages = "25:1--25:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3290047", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3290047", abstract = "Today, diversifying the retrieval results of a certain query will improve customers' search efficiency. Showing the multiple aspects of information provides users an overview of the object, which helps them fast target their demands. To discover aspects, research focuses on generating image clusters from initially retrieved results. As an effective approach, latent Dirichlet allocation (LDA) has been proved to have good performance on discovering high-level topics. However, traditional LDA is designed to process textual words, and it needs the input as discrete data. 
When we apply this algorithm to process continuous visual images, a common solution is to quantize the continuous features into discrete form by a bag-of-visual-words algorithm. During this process, quantization error will lead to information that inevitably is lost. To construct a topic model with complete visual information, this work applies Gaussian latent Dirichlet allocation (GLDA) on the diversity issue of image retrieval. In this model, traditional multinomial distribution is substituted with Gaussian distribution to model continuous visual features. In addition, we propose a two-phase spectral clustering strategy, called dual spectral clustering, to generate clusters from region level to image level. The experiments on the challenging landmarks of the DIV400 database show that our proposal improves relevance and diversity by about 10\% compared to traditional topic models.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{He:2019:ICV, author = "Chen He and Haifeng Hu", title = "Image Captioning With Visual-Semantic Double Attention", journal = j-TOMM, volume = "15", number = "1", pages = "26:1--26:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3292058", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3292058", abstract = "In this article, we propose a novel Visual-Semantic Double Attention (VSDA) model for image captioning. In our approach, VSDA consists of two parts: a modified visual attention model is used to extract sub-region image features, then a new SEmantic Attention (SEA) model is proposed to distill semantic features. 
Traditional attribute-based models always neglect the distinctive importance of each attribute word and fuse all of them into recurrent neural networks, resulting in abundant irrelevant semantic features. In contrast, at each timestep, our model selects the most relevant word that aligns with current context. In other words, the real power of VSDA lies in the ability of not only leveraging semantic features but also eliminating the influence of irrelevant attribute words to make the semantic guidance more precise. Furthermore, our approach solves the problem that visual attention models cannot boost generating non-visual words. Considering that visual and semantic features are complementary to each other, our model can leverage both of them to strengthen the generations of visual and non-visual words. Extensive experiments are conducted on famous datasets: MS COCO and Flickr30k. The results show that VSDA outperforms other methods and achieves promising performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2019:MII, author = "Ruoyu Liu and Yao Zhao and Shikui Wei and Liang Zheng and Yi Yang", title = "Modality-Invariant Image-Text Embedding for Image-Sentence Matching", journal = j-TOMM, volume = "15", number = "1", pages = "27:1--27:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3300939", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3300939", abstract = "Performing direct matching among different modalities (like image and text) can benefit many tasks in computer vision, multimedia, information retrieval, and information fusion. 
Most of existing works focus on class-level image-text matching, called cross-modal retrieval, which attempts to propose a uniform model for matching images with all types of texts, for example, tags, sentences, and articles (long texts). Although cross-model retrieval alleviates the heterogeneous gap among visual and textual information, it can provide only a rough correspondence between two modalities. In this article, we propose a more precise image-text embedding method, image-sentence matching, which can provide heterogeneous matching in the instance level. The key issue for image-text embedding is how to make the distributions of the two modalities consistent in the embedding space. To address this problem, some previous works on the cross-model retrieval task have attempted to pull close their distributions by employing adversarial learning. However, the effectiveness of adversarial learning on image-sentence matching has not been proved and there is still not an effective method. Inspired by previous works, we propose to learn a modality-invariant image-text embedding for image-sentence matching by involving adversarial learning. On top of the triplet loss--based baseline, we design a modality classification network with an adversarial loss, which classifies an embedding into either the image or text modality. In addition, the multi-stage training procedure is carefully designed so that the proposed network not only imposes the image-text similarity constraints by ground-truth labels, but also enforces the image and text embedding distributions to be similar by adversarial learning. Experiments on two public datasets (Flickr30k and MSCOCO) demonstrate that our method yields stable accuracy improvement over the baseline model and that our results compare favorably to the state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ma:2019:PFC, author = "Ruijun Ma and Haifeng Hu and Weixuan Wang and Jia Xu and Zhengming Li", title = "Photorealistic Face Completion with Semantic Parsing and Face Identity-Preserving Features", journal = j-TOMM, volume = "15", number = "1", pages = "28:1--28:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3300940", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3300940", abstract = "Tremendous progress on deep learning has shown exciting potential for a variety of face completion tasks. However, most learning-based methods are limited to handle general or structure specified face images (e.g., well-aligned faces). In this article, we propose a novel face completion algorithm, called Learning and Preserving Face Completion Network (LP-FCN), which simultaneously parses face images and extracts face identity-preserving (FIP) features. By tackling these two tasks in a mutually boosting way, the LP-FCN can guide an identity preserving inference and ensure pixel faithfulness of completed faces. In addition, we adopt a global discriminator and a local discriminator to distinguish real images from synthesized ones. By training with a combined identity preserving, semantic parsing and adversarial loss, the LP-FCN encourages the completion results to be semantically valid and visually consistent for more complicated image completion tasks. Experiments show that our approach obtains similar visual quality, but achieves better performance on unaligned faces completion and fine detailed synthesis against the state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Lokoc:2019:ISS, author = "Jakub Lokoc and Gregor Kovalc{\'\i}k and Bernd M{\"u}nzer and Klaus Sch{\"o}ffmann and Werner Bailer and Ralph Gasser and Stefanos Vrochidis and Phuong Anh Nguyen and Sitapa Rujikietgumjorn and Kai Uwe Barthel", title = "Interactive Search or Sequential Browsing? {A} Detailed Analysis of the {Video Browser Showdown 2018}", journal = j-TOMM, volume = "15", number = "1", pages = "29:1--29:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3295663", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3295663", abstract = "This work summarizes the findings of the 7th iteration of the Video Browser Showdown (VBS) competition organized as a workshop at the 24th International Conference on Multimedia Modeling in Bangkok. The competition focuses on video retrieval scenarios in which the searched scenes were either previously observed or described by another person (i.e., an example shot is not available). During the event, nine teams competed with their video retrieval tools in providing access to a shared video collection with 600 hours of video content. Evaluation objectives, rules, scoring, tasks, and all participating tools are described in the article. In addition, we provide some insights into how the different teams interacted with their video browsers, which was made possible by a novel interaction logging mechanism introduced for this iteration of the VBS. 
The results collected at the VBS evaluation server confirm that searching for one particular scene in the collection when given a limited time is still a challenging task for many of the approaches that were showcased during the event. Given only a short textual description, finding the correct scene is even harder. In ad hoc search with multiple relevant scenes, the tools were mostly able to find at least one scene, whereas recall was the issue for many teams. The logs also reveal that even though recent exciting advances in machine learning narrow the classical semantic gap problem, user-centric interfaces are still required to mediate access to specific content. Finally, open challenges and lessons learned are presented for future VBS events.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2019:ESI, author = "Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb {El Saddik}", title = "Editorial to Special Issue on Deep Learning for Intelligent Multimedia Analytics", journal = j-TOMM, volume = "15", number = "1s", pages = "1:1--1:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3292059", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3292059", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2019:DLB, author = "Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb {El Saddik}", title = "Deep Learning-Based Multimedia Analytics: a Review", journal = j-TOMM, volume = "15", number = "1s", pages = "2:1--2:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3279952", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3279952", abstract = "The multimedia community has witnessed the rise of deep learning-based techniques in analyzing multimedia content more effectively. In the past decade, the convergence of deep-learning and multimedia analytics has boosted the performance of several traditional tasks, such as classification, detection, and regression, and has also fundamentally changed the landscape of several relatively new areas, such as semantic segmentation, captioning, and content generation. This article aims to review the development path of major tasks in multimedia analytics and take a look into future directions. We start by summarizing the fundamental deep techniques related to multimedia analytics, especially in the visual domain, and then review representative high-level tasks powered by recent advances. Moreover, the performance review of popular benchmarks gives a pathway to technology advancement and helps identify both milestone works and future directions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Xie:2019:CAN, author = "Hongtao Xie and Shancheng Fang and Zheng-Jun Zha and Yating Yang and Yan Li and Yongdong Zhang", title = "Convolutional Attention Networks for Scene Text Recognition", journal = j-TOMM, volume = "15", number = "1s", pages = "3:1--3:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231737", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231737", abstract = "In this article, we present Convolutional Attention Networks (CAN) for unconstrained scene text recognition. Recent dominant approaches for scene text recognition are mainly based on Convolutional Neural Networks (CNN) and Recurrent Neural Networks (RNN), where the CNN encodes images and the RNN generates character sequences. Our CAN is different from these methods; our CAN is completely built on CNN and includes an attention mechanism. The distinctive characteristics of our method include (i) CAN follows encoder-decoder architecture, in which the encoder is a deep two-dimensional CNN and the decoder is a one-dimensional CNN; (ii) the attention mechanism is applied in every convolutional layer of the decoder, and we propose a novel spatial attention method using average pooling; and (iii) position embeddings are equipped in both a spatial encoder and a sequence decoder to give our networks a sense of location. We conduct experiments on standard datasets for scene text recognition, including Street View Text, IIIT5K, and ICDAR datasets. 
The experimental results validate the effectiveness of different components and show that our convolutional-based method achieves state-of-the-art or competitive performance over prior works, even without the use of RNN.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2019:SAD, author = "Zhineng Chen and Shanshan Ai and Caiyan Jia", title = "Structure-Aware Deep Learning for Product Image Classification", journal = j-TOMM, volume = "15", number = "1s", pages = "4:1--4:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231742", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231742", abstract = "Automatic product image classification is a task of crucial importance with respect to the management of online retailers. Motivated by recent advancements of deep Convolutional Neural Networks (CNN) on image classification, in this work we revisit the problem in the context of product images with the existence of a predefined categorical hierarchy and attributes, aiming to leverage the hierarchy and attributes to improve classification accuracy. With these structure-aware clues, we argue that more advanced deep models could be developed beyond the flat one-versus-all classification performed by conventional CNNs. 
To this end, novel efforts of this work include a salient-sensitive CNN that gazes into the product foreground by inserting a dedicated spatial attention module; a multiclass regression-based refinement that is expected to predict more accurately by merging prediction scores from multiple preceding CNNs, each corresponding to a distinct classifier in the hierarchy; and a multitask deep learning architecture that effectively explores correlations among categories and attributes for categorical label prediction. Experimental results on nearly 1 million real-world product images basically validate the effectiveness of the proposed efforts individually and jointly, from which performance gains are observed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Jiang:2019:DPR, author = "Shuqiang Jiang and Gongwei Chen and Xinhang Song and Linhu Liu", title = "Deep Patch Representations with Shared Codebook for Scene Classification", journal = j-TOMM, volume = "15", number = "1s", pages = "5:1--5:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231738", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231738", abstract = "Scene classification is a challenging problem. Compared with object images, scene images are more abstract, as they are composed of objects. Object and scene images have different characteristics with different scales and composition structures. How to effectively integrate the local mid-level semantic representations including both object and scene concepts needs to be investigated, which is an important aspect for scene classification. 
In this article, the idea of a sharing codebook is introduced by organically integrating deep learning, concept feature, and local feature encoding techniques. More specifically, the shared local feature codebook is generated from the combined ImageNet1K and Places365 concepts (Mixed1365) using convolutional neural networks. As the Mixed1365 features cover all the semantic information including both object and scene concepts, we can extract a shared codebook from the Mixed1365 features, which only contain a subset of the whole 1,365 concepts with the same codebook size. The shared codebook can not only provide complementary representations without additional codebook training but also be adaptively extracted toward different scene classification tasks. A method of fusing the encoded features with both the original codebook and the shared codebook is proposed for scene classification. In this way, more comprehensive and representative image features can be generated for classification. Extensive experimentations conducted on two public datasets validate the effectiveness of the proposed method. Besides, some useful observations are also revealed to show the advantage of shared codebook.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2019:VCR, author = "Rui-Wei Zhao and Qi Zhang and Zuxuan Wu and Jianguo Li and Yu-Gang Jiang", title = "Visual Content Recognition by Exploiting Semantic Feature Map with Attention and Multi-task Learning", journal = j-TOMM, volume = "15", number = "1s", pages = "6:1--6:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231739", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231739", abstract = "Recent studies have shown that spatial relationships among objects are very important for visual recognition, since they can provide rich clues on object contexts within the images. In this article, we introduce a novel method to learn the Semantic Feature Map (SFM) with attention-based deep neural networks for image and video classification in an end-to-end manner, aiming to explicitly model the spatial object contexts within the images. In particular, we explicitly apply the designed gate units to the extracted object features for important objects selection and noise removal. These selected object features are then organized into the proposed SFM, which is a compact and discriminative representation with the spatial information among objects preserved. Finally, we employ either Fully Convolutional Networks (FCN) or Long-Short Term Memory (LSTM) as the classifiers on top of the SFM for content recognition. A novel multi-task learning framework with image classification loss, object localization loss, and grid labeling loss are also introduced to help better learn the model parameters. 
We conduct extensive evaluations and comparative studies to verify the effectiveness of the proposed approach on Pascal VOC 2007/2012 and MS-COCO benchmarks for image classification. In addition, the experimental results also show that the SFMs learned from the image domain can be successfully transferred to CCV and FCVID benchmarks for video classification.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2019:CMF, author = "Xueliang Liu and Meng Wang and Zheng-Jun Zha and Richang Hong", title = "Cross-Modality Feature Learning via Convolutional Autoencoder", journal = j-TOMM, volume = "15", number = "1s", pages = "7:1--7:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231740", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231740", abstract = "Learning robust and representative features across multiple modalities has been a fundamental problem in machine learning and multimedia fields. In this article, we propose a novel MUltimodal Convolutional AutoEncoder (MUCAE) approach to learn representative features from visual and textual modalities. For each modality, we integrate the convolutional operation into an autoencoder framework to learn a joint representation from the original image and text content. We optimize the convolutional autoencoders of different modalities jointly by exploiting the correlation between the hidden representations from the convolutional autoencoders, in particular by minimizing both the reconstructing error of each modality and the correlation divergence between the hidden feature of different modalities. 
Compared to the conventional solutions relying on hand-crafted features, the proposed MUCAE approach encodes features from image pixels and text characters directly and produces more representative and robust features. We evaluate MUCAE on cross-media retrieval as well as unimodal classification tasks over real-world large-scale multimedia databases. Experimental results have shown that MUCAE performs better than the state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2019:DCN, author = "Jiawei Liu and Zheng-Jun Zha and Xuejin Chen and Zilei Wang and Yongdong Zhang", title = "Dense {$3$D}-Convolutional Neural Network for Person Re-Identification in Videos", journal = j-TOMM, volume = "15", number = "1s", pages = "8:1--8:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3231741", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3231741", abstract = "Person re-identification aims at identifying a certain pedestrian across non-overlapping multi-camera networks in different time and places. Existing person re-identification approaches mainly focus on matching pedestrians on images; however, little attention has been paid to re-identify pedestrians in videos. Compared to images, video clips contain motion patterns of pedestrians, which is crucial to person re-identification. Moreover, consecutive video frames present pedestrian appearance with different body poses and from different viewpoints, providing valuable information toward addressing the challenge of pose variation, occlusion, and viewpoint change, and so on. 
In this article, we propose a Dense 3D-Convolutional Network (D3DNet) to jointly learn spatio-temporal and appearance representation for person re-identification in videos. The D3DNet consists of multiple three-dimensional (3D) dense blocks and transition layers. The 3D dense blocks enlarge the receptive fields of visual neurons in both spatial and temporal dimensions, leading to discriminative appearance representation as well as short-term and long-term motion patterns of pedestrians without the requirement of an additional motion estimation module. Moreover, we formulate a loss function consisting of an identification loss and a center loss to minimize intra-class variance and maximize inter-class variance simultaneously, toward addressing the challenge of large intra-class variance and small inter-class variance. Extensive experiments on two real-world video datasets of person identification, i.e., MARS and iLIDS-VID, have shown the effectiveness of the proposed approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2019:DSM, author = "Liang Zhao and Zhikui Chen and Laurence T. Yang and M. Jamal Deen and Z. 
Jane Wang", title = "Deep Semantic Mapping for Heterogeneous Multimedia Transfer Learning Using Co-Occurrence Data", journal = j-TOMM, volume = "15", number = "1s", pages = "9:1--9:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3241055", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3241055", abstract = "Transfer learning, which focuses on finding a favorable representation for instances of different domains based on auxiliary data, can mitigate the divergence between domains through knowledge transfer. Recently, increasing efforts on transfer learning have employed deep neural networks (DNN) to learn more robust and higher level feature representations to better tackle cross-media disparities. However, only a few articles consider the correction and semantic matching between multi-layer heterogeneous domain networks. In this article, we propose a deep semantic mapping model for heterogeneous multimedia transfer learning (DHTL) using co-occurrence data. More specifically, we integrate the DNN with canonical correlation analysis (CCA) to derive a deep correlation subspace as the joint semantic representation for associating data across different domains. In the proposed DHTL, a multi-layer correlation matching network across domains is constructed, in which the CCA is combined to bridge each pair of domain-specific hidden layers. To train the network, a joint objective function is defined and the optimization processes are presented. When the deep semantic representation is achieved, the shared features of the source domain are transferred for task learning in the target domain. 
Extensive experiments for three multimedia recognition applications demonstrate that the proposed DHTL can effectively find deep semantic representations for heterogeneous domains, and it is superior to the several existing state-of-the-art methods for deep transfer learning.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hossain:2019:ADL, author = "M. Shamim Hossain and Syed Umar Amin and Mansour Alsulaiman and Ghulam Muhammad", title = "Applying Deep Learning for Epilepsy Seizure Detection and Brain Mapping Visualization", journal = j-TOMM, volume = "15", number = "1s", pages = "10:1--10:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3241056", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3241056", abstract = "Deep Convolutional Neural Network (CNN) has achieved remarkable results in computer vision tasks for end-to-end learning. We evaluate here the power of a deep CNN to learn robust features from raw Electroencephalogram (EEG) data to detect seizures. Seizures are hard to detect, as they vary both inter- and intra-patient. In this article, we use a deep CNN model for seizure detection task on an open-access EEG epilepsy dataset collected at the Boston Children's Hospital. Our deep learning model is able to extract spectral, temporal features from EEG epilepsy data and use them to learn the general structure of a seizure that is less sensitive to variations. For cross-patient EEG data, our method produced an overall sensitivity of 90.00\%, specificity of 91.65\%, and overall accuracy of 98.05\% for the whole dataset of 23 patients. 
The system can detect seizures with an accuracy of 99.46\%. Thus, it can be used as an excellent cross-patient seizure classifier. The results show that our model performs better than the previous state-of-the-art models for patient-specific and cross-patient seizure detection task. The method gave an overall accuracy of 99.65\% for patient-specific data. The system can also visualize the special orientation of band power features. We use correlation maps to relate spectral amplitude features to the output in the form of images. By using the results from our deep learning model, this visualization method can be used as an effective multimedia tool for producing quick and relevant brain mapping images that can be used by medical experts for further investigation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Alameda-Pineda:2019:SSM, author = "Xavier Alameda-Pineda and Miriam Redi and Mohammad Soleymani and Nicu Sebe and Shih-Fu Chang and Samuel Gosling", title = "Special Section on Multimodal Understanding of Social, Affective, and Subjective Attributes", journal = j-TOMM, volume = "15", number = "1s", pages = "11:1--11:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3292061", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3292061", abstract = "Multimedia scientists have largely focused their research on the recognition of tangible properties of data such as objects and scenes. Recently, the field has started evolving toward the modeling of more complex properties. 
For example, the understanding of social, affective, and subjective attributes of visual data has attracted the attention of many research teams at the crossroads of computer vision, multimedia, and social sciences. These intangible attributes include, for example, visual beauty, video popularity, or user behavior. Multiple, diverse challenges arise when modeling such properties from multimedia data. The sections concern technical aspects such as reliable groundtruth collection, the effective learning of subjective properties, or the impact of context in subjective perception; see Refs. [2] and [3].", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2019:VPI, author = "Chuan-Shen Hu and Yi-Tsung Hsieh and Hsiao-Wei Lin and Mei-Chen Yeh", title = "{Virtual Portraitist}: an Intelligent Tool for Taking Well-Posed Selfies", journal = j-TOMM, volume = "15", number = "1s", pages = "12:1--12:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3288760", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3288760", abstract = "Smart photography carries the promise of quality improvement and functionality extension in making aesthetically appealing pictures. In this article, we focus on self-portrait photographs and introduce new methods that guide a user in how to best pose while taking a selfie. While most of the current solutions use a post processing procedure to beautify a picture, the developed tool enables a novel function of recommending a good look before the photo is captured. 
Given an input face image, the tool automatically estimates the pose-based aesthetic score, finds the most attractive angle of the face, and suggests how the pose should be adjusted. The recommendation results are determined adaptively to the appearance and initial pose of the input face. We apply a data mining approach to find distinctive, frequent itemsets and association rules from online profile pictures, upon which the aesthetic estimation and pose recommendation methods are developed. A simulated and a real image set are used for experimental evaluation. The results show the proposed aesthetic estimation method can effectively select user-favorable photos. Moreover, the recommendation performance for the vertical adjustment is moderately related to the degree of conformity among the professional photographers' recommendations. This study echoes the trend of instant photo sharing, in which a user takes a picture and then immediately shares it on a social network without engaging in tedious editing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Okada:2019:MDG, author = "Shogo Okada and Laurent Son Nguyen and Oya Aran and Daniel Gatica-Perez", title = "Modeling Dyadic and Group Impressions with Intermodal and Interperson Features", journal = j-TOMM, volume = "15", number = "1s", pages = "13:1--13:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3265754", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3265754", abstract = "This article proposes a novel feature-extraction framework for inferring impression personality traits, emergent leadership skills, communicative competence, and hiring decisions. The proposed framework extracts multimodal features, describing each participant's nonverbal activities. It captures intermodal and interperson relationships in interactions and captures how the target interactor generates nonverbal behavior when other interactors also generate nonverbal behavior. The intermodal and interperson patterns are identified as frequent co-occurring events based on clustering from multimodal sequences. The proposed framework is applied to the SONVB corpus, which is an audiovisual dataset collected from dyadic job interviews, and the ELEA audiovisual data corpus, which is a dataset collected from group meetings. We evaluate the framework on a binary classification task involving 15 impression variables from the two data corpora. The experimental results show that the model trained with co-occurrence features is more accurate than previous models for 14 out of 15 traits.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhao:2019:PER, author = "Sicheng Zhao and Amir Gholaminejad and Guiguang Ding and Yue Gao and Jungong Han and Kurt Keutzer", title = "Personalized Emotion Recognition by Personality-Aware High-Order Learning of Physiological Signals", journal = j-TOMM, volume = "15", number = "1s", pages = "14:1--14:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3233184", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3233184", abstract = "Due to the subjective responses of different subjects to physical stimuli, emotion recognition methodologies from physiological signals are increasingly becoming personalized. Existing works mainly focused on modeling the involved physiological corpus of each subject, without considering the psychological factors, such as interest and personality. The latent correlation among different subjects has also been rarely examined. In this article, we propose to investigate the influence of personality on emotional behavior in a hypergraph learning framework. Assuming that each vertex is a compound tuple (subject, stimuli), multi-modal hypergraphs can be constructed based on the personality correlation among different subjects and on the physiological correlation among corresponding stimuli. To reveal the different importance of vertices, hyperedges, and modalities, we learn the weights for each of them. As the hypergraphs connect different subjects on the compound vertices, the emotions of multiple subjects can be simultaneously recognized. In this way, the constructed hypergraphs are vertex-weighted multi-modal multi-task ones. 
The estimated factors, referred to as emotion relevance, are employed for emotion recognition. We carry out extensive experiments on the ASCERTAIN dataset and the results demonstrate the superiority of the proposed method, as compared to the state-of-the-art emotion recognition approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Trabelsi:2019:UDS, author = "Rim Trabelsi and Jagannadan Varadarajan and Le Zhang and Issam Jabri and Yong Pei and Fethi Smach and Ammar Bouallegue and Pierre Moulin", title = "Understanding the Dynamics of Social Interactions: a Multi-Modal Multi-View Approach", journal = j-TOMM, volume = "15", number = "1s", pages = "15:1--15:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3300937", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3300937", abstract = "In this article, we deal with the problem of understanding human-to-human interactions as a fundamental component of social events analysis. Inspired by the recent success of multi-modal visual data in many recognition tasks, we propose a novel approach to model dyadic interaction by means of features extracted from synchronized 3D skeleton coordinates, depth, and Red Green Blue (RGB) sequences. From skeleton data, we extract new view-invariant proxemic features, named Unified Proxemic Descriptor (UProD), which is able to incorporate intrinsic and extrinsic distances between two interacting subjects. A novel key frame selection method is introduced to identify salient instants of the interaction sequence based on the joints' energy. 
From Red Green Blue Depth (RGBD) videos, more holistic CNN features are extracted by applying an adaptive pre-trained Convolutional Neural Networks (CNNs) on optical flow frames. For better understanding the dynamics of interactions, we expand the boundaries of dyadic interactions analysis by proposing a fundamentally new modeling for non-treated problem aiming to discern the active from the passive interactor. Extensive experiments have been carried out on four multi-modal and multi-view interactions datasets. The experimental results demonstrate the superiority of our proposed techniques against the state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gan:2019:MSF, author = "Tian Gan and Junnan Li and Yongkang Wong and Mohan S. Kankanhalli", title = "A Multi-sensor Framework for Personal Presentation Analytics", journal = j-TOMM, volume = "15", number = "2", pages = "30:1--30:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3300941", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3300941", abstract = "Presentation has been an effective method for delivering information to an audience for many years. Over the past few decades, technological advancements have revolutionized the way humans deliver presentation. Conventionally, the quality of a presentation is usually evaluated through painstaking manual analysis with experts. Although the expert feedback is effective in assisting users to improve their presentation skills, manual evaluation suffers from high cost and is often not available to most individuals. 
In this work, we propose a novel multi-sensor self-quantification system for presentations, which is designed based on a new proposed assessment rubric. We present our analytics model with conventional ambient sensors (i.e., static cameras and Kinect sensor) and the emerging wearable egocentric sensors (i.e., Google Glass). In addition, we performed a cross-correlation analysis of speaker's vocal behavior and body language. The proposed framework is evaluated on a new presentation dataset, namely, NUS Multi-Sensor Presentation dataset, which consists of 51 presentations covering a diverse range of topics. To validate the efficacy of the proposed system, we have conducted a series of user studies with the speakers and an interview with an English communication expert, which reveals positive and promising feedback.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Tang:2019:RVL, author = "Pengjie Tang and Hanli Wang and Qinyu Li", title = "Rich Visual and Language Representation with Complementary Semantics for Video Captioning", journal = j-TOMM, volume = "15", number = "2", pages = "31:1--31:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3303083", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3303083", abstract = "It is interesting and challenging to translate a video to natural description sentences based on the video content. In this work, an advanced framework is built to generate sentences with coherence and rich semantic expressions for video captioning. 
A long short term memory (LSTM) network with an improved factored way is first developed, which takes the inspiration of LSTM with a conventional factored way and a common practice to feed multi-modal features into LSTM at the first time step for visual description. Then, the incorporation of the LSTM network with the proposed improved factored way and un-factored way is exploited, and a voting strategy is utilized to predict candidate words. In addition, for robust and abstract visual and language representation, residuals are employed to enhance the gradient signals that are learned from the residual network (ResNet), and a deeper LSTM network is constructed. Furthermore, three convolutional neural network based features extracted from GoogLeNet, ResNet101, and ResNet152, are fused to catch more comprehensive and complementary visual information. Experiments are conducted on two benchmark datasets, including MSVD and MSR-VTT2016, and competitive performances are obtained by the proposed techniques as compared to other state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2019:MLS, author = "Chen Shen and Zhongming Jin and Wenqing Chu and Rongxin Jiang and Yaowu Chen and Guo-Jun Qi and Xian-Sheng Hua", title = "Multi-level Similarity Perception Network for Person Re-identification", journal = j-TOMM, volume = "15", number = "2", pages = "32:1--32:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309881", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309881", abstract = "In this article, we propose a novel deep Siamese architecture based on a convolutional neural network (CNN) and multi-level similarity perception for the person re-identification (re-ID) problem. According to the distinct characteristics of diverse feature maps, we effectively apply different similarity constraints to both low-level and high-level feature maps during training stage. Due to the introduction of appropriate similarity comparison mechanisms at different levels, the proposed approach can adaptively learn discriminative local and global feature representations, respectively, while the former is more sensitive in localizing part-level prominent patterns relevant to re-identifying people across cameras. Meanwhile, a novel strong activation pooling strategy is utilized on the last convolutional layer for abstract local-feature aggregation to pursue more representative feature representations. Based on this, we propose final feature embedding by simultaneously encoding original global features and discriminative local features. 
In addition, our framework has two other benefits: First, classification constraints can be easily incorporated into the framework, forming a unified multi-task network with similarity constraints. Second, as similarity-comparable information has been encoded in the network's learning parameters via back-propagation, pairwise input is not necessary at test time. That means we can extract features of each gallery image and build an index in an off-line manner, which is essential for large-scale real-world applications. Experimental results on multiple challenging benchmarks demonstrate that our method achieves splendid performance compared with the current state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Miao:2019:DLS, author = "Yu Miao and Haiwei Dong and Jihad Mohamad {Al Jaam} and Abdulmotaleb {El Saddik}", title = "A Deep Learning System for Recognizing Facial Expression in Real-Time", journal = j-TOMM, volume = "15", number = "2", pages = "33:1--33:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311747", abstract = "This article presents an image-based real-time facial expression recognition system that is able to recognize the facial expressions of several subjects on a webcam at the same time. Our proposed methodology combines a supervised transfer learning strategy and a joint supervision method with center loss, which is crucial for facial tasks. 
A newly proposed Convolutional Neural Network (CNN) model, MobileNet, which has both accuracy and speed, is deployed in both offline and in a real-time framework that enables fast and accurate real-time output. Evaluations towards two publicly available datasets, JAFFE and CK+, are carried out respectively. The JAFFE dataset reaches an accuracy of 95.24\%, while an accuracy of 96.92\% is achieved on the 6-class CK+ dataset, which contains only the last frames of image sequences. At last, the average run-time cost for the recognition of the real-time implementation is around 3.57ms/frame on a NVIDIA Quadro K4200 GPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mesfin:2019:UET, author = "Gebremariam Mesfin and Nadia Hussain and Alexandra Covaci and Gheorghita Ghinea", title = "Using Eye Tracking and Heart-Rate Activity to Examine Crossmodal Correspondences {QoE} in {Mulsemedia}", journal = j-TOMM, volume = "15", number = "2", pages = "34:1--34:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3303080", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3303080", abstract = "Different senses provide us with information of various levels of precision and enable us to construct a more precise representation of the world. Rich multisensory simulations are thus beneficial for comprehension, memory reinforcement, or retention of information. Crossmodal mappings refer to the systematic associations often made between different sensory modalities (e.g., high pitch is matched with angular shapes) and govern multisensory processing. 
A great deal of research effort has been put into exploring cross-modal correspondences in the field of cognitive science. However, the possibilities they open in the digital world have been relatively unexplored. Multiple sensorial media (mulsemedia) provides a highly immersive experience to the users and enhances their Quality of Experience (QoE) in the digital world. Thus, we consider that studying the plasticity and the effects of cross-modal correspondences in a mulsemedia setup can bring interesting insights about improving the human computer dialogue and experience. In our experiments, we exposed users to videos with certain visual dimensions (brightness, color, and shape), and we investigated whether the pairing with a cross-modal matching sound (high and low pitch) and the corresponding auto-generated vibrotactile effects (produced by a haptic vest) lead to an enhanced QoE. For this, we captured the eye gaze and the heart rate of users while experiencing mulsemedia, and we asked them to fill in a set of questions targeting their enjoyment and perception at the end of the experiment. Results showed differences in eye-gaze patterns and heart rate between the experimental and the control group, indicating changes in participants' engagement when videos were accompanied by matching cross-modal sounds (this effect was the strongest for the video displaying angular shapes and high-pitch audio) and transitively generated cross-modal vibrotactile effects.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cheung:2019:DOC, author = "Ming Cheung and James She and Weiwei Sun and Jiantao Zhou", title = "Detecting Online Counterfeit-goods Seller using Connection Discovery", journal = j-TOMM, volume = "15", number = "2", pages = "35:1--35:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311785", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311785", abstract = "With the advancement of social media and mobile technology, any smartphone user can easily become a seller on social media and e-commerce platforms, such as Instagram and Carousell in Hong Kong or Taobao in China. A seller shows images of their products and annotates their images with suitable tags that can be searched easily by others. Those images could be taken by the seller, or the seller could use images shared by other sellers. Among sellers, some sell counterfeit goods, and these sellers may use disguising tags and language, which make detecting them a difficult task. This article proposes a framework to detect counterfeit sellers by using deep learning to discover connections among sellers from their shared images. Based on 473K shared images from Taobao, Instagram, and Carousell, it is proven that the proposed framework can detect counterfeit sellers. The framework is 30\% better than approaches using object recognition in detecting counterfeit sellers. To the best of our knowledge, this is the first work to detect online counterfeit sellers from their shared images.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yarnagula:2019:QMC, author = "Hema Kumar Yarnagula and Parikshit Juluri and Sheyda Kiani Mehr and Venkatesh Tamarapalli and Deep Medhi", title = "{QoE} for Mobile Clients with {Segment-aware Rate Adaptation Algorithm (SARA)} for {DASH} Video Streaming", journal = j-TOMM, volume = "15", number = "2", pages = "36:1--36:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311749", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311749", abstract = "Dynamic adaptive streaming over HTTP (DASH) is widely used for video streaming on mobile devices. Ensuring a good quality of experience (QoE) for mobile video streaming is essential, as it severely impacts both the network and content providers' revenue. Thus, a good rate adaptation algorithm at the client end that provides high QoE is critically important. Recently, a segment size-aware rate adaptation (SARA) algorithm was proposed for DASH clients. However, its performance on mobile clients has not been investigated so far. The main contributions of this article are twofold: (1) We discuss SARA's implementation for mobile clients to improve the QoE in mobile video streaming, one that accurately predicts the download time for the next segment and makes an informed bitrate selection, and (2) we developed a new parametric QoE model to compute a cumulative score that helps in fair comparison of different adaptation algorithms. 
Based on our subjective and objective evaluation, we observed that SARA for mobile clients outperforms others by 17\% on average, in terms of the Mean Opinion Score, while achieving, on average, a 76\% improvement in terms of the interruption ratio. The score obtained from our new parametric QoE model also demonstrates that the SARA algorithm for mobile clients gives a better QoE among all the algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Atrey:2019:WMD, author = "Pradeep K. Atrey and Bakul Trehan and Mukesh K. Saini", title = "Watch Me from Distance {(WMD)}: a Privacy-Preserving Long-Distance Video Surveillance System", journal = j-TOMM, volume = "15", number = "2", pages = "37:1--37:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3312574", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3312574", abstract = "Preserving the privacy of people in video surveillance systems is quite challenging, and a significant amount of research has been done to solve this problem in recent times. Majority of existing techniques are based on detecting bodily cues such as face and/or silhouette and obscuring them so that people in the videos cannot be identified. We observe that merely hiding bodily cues is not enough for protecting identities of the individuals in the videos. An adversary, who has prior contextual knowledge about the surveilled area, can identify people in the video by exploiting the implicit inference channels such as behavior, place, and time. 
This article presents an anonymous surveillance system, called Watch Me from Distance (WMD), which advocates for outsourcing of surveillance video monitoring (similar to call centers) to the long-distance sites where professional security operators watch the video and alert the local site when any suspicious or abnormal event takes place. We find that long-distance monitoring helps in decoupling the contextual knowledge of security operators. Since security operators at the remote site could turn into adversaries, a trust computation model to determine the credibility of the operators is presented as an integral part of the proposed system. The feasibility study and experiments suggest that the proposed system provides more robust measures of privacy yet maintains surveillance effectiveness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hsu:2019:LMC, author = "Chih-Fan Hsu and Yu-Shuen Wang and Chin-Laung Lei and Kuan-Ta Chen", title = "Look at Me! {Correcting} Eye Gaze in Live Video Communication", journal = j-TOMM, volume = "15", number = "2", pages = "38:1--38:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311784", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311784", abstract = "Although live video communication is widely used, it is generally less engaging than face-to-face communication because of limitations on social, emotional, and haptic feedback. Missing eye contact is one such problem caused by the physical deviation between the screen and camera on a device. 
Manipulating video frames to correct eye gaze is a solution to this problem. In this article, we introduce a system to rotate the eyeball of a local participant before the video frame is sent to the remote side. It adopts a warping-based convolutional neural network to relocate pixels in eye regions. To improve visual quality, we minimize the L2 distance between the ground truths and warped eyes. We also present several newly designed loss functions to help network training. These new loss functions are designed to preserve the shape of eye structures and minimize color changes around the periphery of eye regions. To evaluate the presented network and loss functions, we objectively and subjectively compared results generated by our system and the state-of-the-art, DeepWarp, in relation to two datasets. The experimental results demonstrated the effectiveness of our system. In addition, we showed that our system can perform eye-gaze correction in real time on a consumer-level laptop. Because of the quality and efficiency of the system, gaze correction by postprocessing through this system is a feasible solution to the problem of missing eye contact in video communication.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ahmad:2019:HDF, author = "Kashif Ahmad and Nicola Conci", title = "How Deep Features Have Improved Event Recognition in Multimedia: a Survey", journal = j-TOMM, volume = "15", number = "2", pages = "39:1--39:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3306240", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3306240", abstract = "Event recognition is one of the areas in multimedia that is attracting great attention of researchers. Being applicable in a wide range of applications, from personal to collective events, a number of interesting solutions for event recognition using multimedia information sources have been proposed. On the other hand, following their immense success in classification, object recognition, and detection, deep learning has been shown to perform well in event recognition tasks also. Thus, a large portion of the literature on event analysis relies nowadays on deep learning architectures. In this article, we provide an extensive overview of the existing literature in this field, analyzing how deep features and deep learning architectures have changed the performance of event recognition frameworks. The literature on event-based analysis of multimedia contents can be categorized into four groups, namely (i) event recognition in single images; (ii) event recognition in personal photo collections; (iii) event recognition in videos; and (iv) event recognition in audio recordings. In this article, we extensively review different deep-learning-based frameworks for event recognition in these four domains. 
Furthermore, we also review some benchmark datasets made available to the scientific community to validate novel event recognition pipelines. In the final part of the manuscript, we also provide a detailed discussion on basic insights gathered from the literature review, and identify future trends and challenges.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2019:ACV, author = "Yadang Chen and Chuanyan Hao and Alex X. Liu and Enhua Wu", title = "Appearance-consistent Video Object Segmentation Based on a Multinomial Event Model", journal = j-TOMM, volume = "15", number = "2", pages = "40:1--40:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321507", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321507", abstract = "In this study, we propose an effective and efficient algorithm for unconstrained video object segmentation, which is achieved in a Markov random field (MRF). In the MRF graph, each node is modeled as a superpixel and labeled as either foreground or background during the segmentation process. The unary potential is computed for each node by learning a transductive SVM classifier under supervision by a few labeled frames. The pairwise potential is used for the spatial-temporal smoothness. In addition, a high-order potential based on the multinomial event model is employed to enhance the appearance consistency throughout the frames. To minimize this intractable feature, we also introduce a more efficient technique that simply extends the original MRF structure. 
The proposed approach was evaluated in experiments with different measures and the results based on a benchmark demonstrated its effectiveness compared with other state-of-the-art algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Roberto:2019:DLS, author = "Roberto Pierdicca and Emanuele Frontoni and Primo Zingaretti and Adriano Mancini and Jelena Loncarski and Marina Paolanti", title = "Design, Large-Scale Usage Testing, and Important Metrics for Augmented Reality Gaming Applications", journal = j-TOMM, volume = "15", number = "2", pages = "41:1--41:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311748", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311748", abstract = "Augmented Reality (AR) offers the possibility to enrich the real world with digital mediated content, increasing in this way the quality of many everyday experiences. While in some research areas such as cultural heritage, tourism, or medicine there is a strong technological investment, AR for game purposes struggles to become a widespread commercial application. In this article, a novel framework for AR kid games is proposed, already developed by the authors for other AR applications such as Cultural Heritage and Arts. In particular, the framework includes different layers such as the development of a series of AR kid puzzle games in an intermediate structure which can be used as a standard for different applications development, the development of a smart configuration tool, together with general guidelines and long-life usage tests and metrics. 
The proposed application is designed for augmenting the puzzle experience, but can be easily extended to other AR gaming applications. Once the user has assembled the real puzzle, AR functionality within the mobile application can be unlocked, bringing to life puzzle characters, creating a seamless game that merges AR interactions with the puzzle reality. The main goals and benefits of this framework can be seen in the development of a novel set of AR tests and metrics in the pre-release phase (in order to help the commercial launch and developers), and in the release phase by introducing the measures for long-life app optimization, usage tests and hint on final users together with a measure to design policy, providing a method for automatic testing of quality and popularity improvements. Moreover, smart configuration tools, as part of the general framework, enabling multi-app and eventually also multi-user development, have been proposed, facilitating the serialization of the applications. Results were obtained from a large-scale user test with about 4 million users on a set of eight gaming applications, providing the scientific community a workflow for implicit quantitative analysis in AR gaming. Different data analytics developed on the data collected by the framework prove that the proposed approach is affordable and reliable for long-life testing and optimization.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Siarohin:2019:IIM, author = "Aliaksandr Siarohin and Gloria Zen and Cveta Majtanovic and Xavier Alameda-Pineda and Elisa Ricci and Nicu Sebe", title = "Increasing Image Memorability with Neural Style Transfer", journal = j-TOMM, volume = "15", number = "2", pages = "42:1--42:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311781", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3311781", abstract = "Recent works in computer vision and multimedia have shown that image memorability can be automatically inferred exploiting powerful deep-learning models. This article advances the state of the art in this area by addressing a novel and more challenging issue: ``Given an arbitrary input image, can we make it more memorable?'' To tackle this problem, we introduce an approach based on an editing-by-applying-filters paradigm: given an input image, we propose to automatically retrieve a set of ``style seeds,'' i.e., a set of style images that, applied to the input image through a neural style transfer algorithm, provide the highest increase in memorability. We show the effectiveness of the proposed approach with experiments on the publicly available LaMem dataset, performing both a quantitative evaluation and a user study. To demonstrate the flexibility of the proposed framework, we also analyze the impact of different implementation choices, such as using different state-of-the-art neural style transfer methods. 
Finally, we show several qualitative results to provide additional insights on the link between image style and memorability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Do:2019:SDC, author = "Thanh-Toan Do and Tuan Hoang and Dang-Khoa Le Tan and Huu Le and Tam V. Nguyen and Ngai-Man Cheung", title = "From Selective Deep Convolutional Features to Compact Binary Representations for Image Retrieval", journal = j-TOMM, volume = "15", number = "2", pages = "43:1--43:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3314051", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3314051", abstract = "In the large-scale image retrieval task, the two most important requirements are the discriminability of image representations and the efficiency in computation and storage of representations. Regarding the former requirement, Convolutional Neural Network is proven to be a very powerful tool to extract highly discriminative local descriptors for effective image search. Additionally, to further improve the discriminative power of the descriptors, recent works adopt fine-tuned strategies. In this article, taking a different approach, we propose a novel, computationally efficient, and competitive framework. Specifically, we first propose various strategies to compute masks, namely, SIFT-masks, SUM-mask, and MAX-mask, to select a representative subset of local convolutional features and eliminate redundant features. 
Our in-depth analyses demonstrate that proposed masking schemes are effective to address the burstiness drawback and improve retrieval accuracy. Second, we propose to employ recent embedding and aggregating methods that can significantly boost the feature discriminability. Regarding the computation and storage efficiency, we include a hashing module to produce very compact binary image representations. Extensive experiments on six image retrieval benchmarks demonstrate that our proposed framework achieves the state-of-the-art retrieval performances.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2019:LCS, author = "Liquan Shen and Ping An and Guorui Feng", title = "Low-Complexity Scalable Extension of the High-Efficiency Video Coding {(SHVC)} Encoding System", journal = j-TOMM, volume = "15", number = "2", pages = "44:1--44:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3313185", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3313185", abstract = "The scalable extension of the high-efficiency video coding (SHVC) system adopts a hierarchical quadtree-based coding unit (CU) that is suitable for various texture and motion properties of videos. Currently, the test model of SHVC identifies the optimal CU size by performing an exhaustive quadtree depth-level search, which achieves a high compression efficiency at a heavy cost in terms of the computational complexity. 
However, many interactive multimedia applications, such as remote monitoring and video surveillance, which are sensitive to time delays, have insufficient computational power for coding high-definition (HD) and ultra-high-definition (UHD) videos. Therefore, it is important, yet challenging, to optimize the SHVC coding procedure and accelerate video coding. In this article, we propose a fast CU quadtree depth-level decision algorithm for inter-frames on enhancement layers that is based on an analysis of inter-layer, spatial, and temporal correlations. When motion/texture properties of coding regions can be identified early, a fast algorithm can be designed for adapting CU depth-level decision procedures to video contents and avoiding unnecessary computations during CU depth-level traversal. The proposed algorithm determines the motion activity level at the treeblock size of the hierarchical quadtree by utilizing motion vectors from its corresponding blocks at the base layer. Based on the motion activity level, neighboring encoded CUs that have larger correlations are preferentially selected to predict the optimal depth level of the current treeblock. Finally, two parameters, namely, the motion activity level and the predicted CU depth level, are used to identify a subset of candidate CU depth levels and adaptively optimize CU depth-level decision processes. The experimental results demonstrate that the proposed scheme can run approximately three times faster than the most recent SHVC reference software, with a negligible loss of compression efficiency. The proposed scheme is efficient for all types of scalable video sequences under various coding conditions and outperforms state-of-the-art fast SHVC and HEVC algorithms. Our scheme is a suitable candidate for interactive HD/UHD video applications that are expected to operate in real-time and power-constrained scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hu:2019:CAA, author = "Jun Hu and Shengsheng Qian and Quan Fang and Xueliang Liu and Changsheng Xu", title = "{A$^2$ CMHNE}: Attention-Aware Collaborative Multimodal Heterogeneous Network Embedding", journal = j-TOMM, volume = "15", number = "2", pages = "45:1--45:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321506", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321506", abstract = "Network representation learning is playing an important role in network analysis due to its effectiveness in a variety of applications. However, most existing network embedding models focus on homogeneous networks and neglect the diverse properties such as different types of network structures and associated multimedia content information. In this article, we learn node representations for multimodal heterogeneous networks, which contain multiple types of nodes and/or links as well as multimodal content such as texts and images. We propose a novel attention-aware collaborative multimodal heterogeneous network embedding method (A$^2$ CMHNE), where an attention-based collaborative representation learning approach is proposed to promote the collaboration of structure-based embedding and content-based embedding, and generate the robust node representation by introducing an attention mechanism that enables informative embedding integration. In experiments, we compare our model with existing network embedding models on two real-world datasets. 
Our method leads to dramatic improvements in performance by 5\%, and 9\% compared with five state-of-the-art embedding methods on one benchmark (M10 Dataset), and on a multi-modal heterogeneous network dataset (WeChat dataset) for node classification, respectively. Experimental results demonstrate the effectiveness of our proposed method on both node classification and link prediction tasks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hosny:2019:RCI, author = "Khalid M. Hosny and Mohamed M. Darwish", title = "Resilient Color Image Watermarking Using Accurate Quaternion Radial Substituted {Chebyshev} Moments", journal = j-TOMM, volume = "15", number = "2", pages = "46:1--46:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3325193", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3325193", abstract = "In this work, a new quaternion-based method for color image watermarking is proposed. In this method, a novel set of quaternion radial substituted Chebyshev moments (QRSCMs) is presented for robust geometrically invariant image watermarking. An efficient computational method is proposed for highly accurate, fast, and numerically stable QRSCMs in polar coordinates. The proposed watermarking method consists of three stages. In the first stage, the Arnold transform is used to improve the security of the watermarking scheme by scrambling the binary watermark. In the second stage, the proposed accurate and stable QRSCMs of the host color image are computed. 
In the third stage, the encrypted binary watermark is embedded into the host image by employing the quantization technique on selected-magnitude QRSCMs where the watermarked color image is obtained by adding the original host color image to the compensation image. Then, the binary watermark can be extracted directly without using the original image from the magnitudes of QRSCMs. Numerical experiments are performed where the performance of proposed method is compared with the existing quaternion moment-based watermarking methods. The comparison clearly shows that the proposed method is very efficient in terms of the visual imperceptibility capability and the robustness under different attacks compared to the existing quaternion moment-based watermarking algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Mou:2019:AVG, author = "Wenxuan Mou and Hatice Gunes and Ioannis Patras", title = "Alone versus In-a-group: a Multi-modal Framework for Automatic Affect Recognition", journal = j-TOMM, volume = "15", number = "2", pages = "47:1--47:??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321509", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:46 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321509", abstract = "Recognition and analysis of human affect has been researched extensively within the field of computer science in the past two decades. However, most of the past research in automatic analysis of human affect has focused on the recognition of affect displayed by people in individual settings and little attention has been paid to the analysis of the affect expressed in group settings. 
In this article, we first analyze the affect expressed by each individual in terms of arousal and valence dimensions in both individual and group videos and then propose methods to recognize the contextual information, i.e., whether a person is alone or in-a-group by analyzing their face and body behavioral cues. For affect analysis, we first devise affect recognition models separately in individual and group videos and then introduce a cross-condition affect recognition model that is trained by combining the two different types of data. We conduct a set of experiments on two datasets that contain both individual and group videos. Our experiments show that (1) the proposed Volume Quantized Local Zernike Moments Fisher Vector outperforms other unimodal features in affect analysis; (2) the temporal learning model, Long-Short Term Memory Networks, works better than the static learning model, Support Vector Machine; (3) decision fusion helps to improve affect recognition, indicating that body behaviors carry emotional information that is complementary rather than redundant to the emotion content in facial behaviors; and (4) it is possible to predict the context, i.e., whether a person is alone or in-a-group, using their non-verbal behavioral cues.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Hong:2019:ASS, author = "Richang Hong", title = "Advanced Stereo Seam Carving by Considering Occlusions on Both Sides", journal = j-TOMM, volume = "15", number = "3", pages = "69:1--69:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321513", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321513", abstract = "Stereo image retargeting plays a significant role in the field of image processing, which aims at making major objects as prominent as possible when the resolution of an image is changed, including maintaining disparity and depth information at the same time. Some seam carving methods are proposed to preserve the geometric consistency of the images. However, the regions of occlusion on both sides are not considered properly. In this article, we propose a solution to solve this problem. A new strategy of seams finding is designed by considering occluded and occluding regions on both of the input images, and leaving geometric consistency in both images intact. We also introduced the method of line segment detection and superpixel segmentation to further improve the quality of the images. Imaging effects are optimized in the process and visual comfort, which is also influenced by other factors, can be boosted as well.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2019:SET, author = "Yun Zhang and Na Li and Sam Kwong and Gangyi Jiang and Huanqiang Zeng", title = "Statistical Early Termination and Early Skip Models for Fast Mode Decision in {HEVC INTRA} Coding", journal = j-TOMM, volume = "15", number = "3", pages = "70:1--70:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321510", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321510", abstract = "In this article, statistical Early Termination (ET) and Early Skip (ES) models are proposed for fast Coding Unit (CU) and prediction mode decision in HEVC INTRA coding, in which three categories of ET and ES sub-algorithms are included. First, the CU ranges of the current CU are recursively predicted based on the texture and CU depth of the spatial neighboring CUs. Second, the statistical model based ET and ES schemes are proposed and applied to optimize the CU and INTRA prediction mode decision, in which the coding complexities over different decision layers are jointly minimized subject to acceptable rate-distortion degradation. Third, the mode correlations among the INTRA prediction modes are exploited to early terminate the full rate-distortion optimization in each CU decision layer. Extensive experiments are performed to evaluate the coding performance of each sub-algorithm and the overall algorithm. 
Experimental results reveal that the overall proposed algorithm can achieve 45.47\% to 74.77\%, and 58.09\% on average complexity reduction, while the overall Bj{\o}ntegaard delta bit rate increase and Bj{\o}ntegaard delta peak signal-to-noise ratio degradation are 2.29\% and -0.11 dB, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Gupta:2019:SGM, author = "Abhinav Gupta and Divya Singhal", title = "A Simplistic Global Median Filtering Forensics Based on Frequency Domain Analysis of Image Residuals", journal = j-TOMM, volume = "15", number = "3", pages = "71:1--71:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321508", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321508", abstract = "Sophisticated image forgeries introduce digital image forensics as an active area of research. In this area, many researchers have addressed the problem of median filtering forensics. Existing median filtering detectors are adequate to classify median filtered images in uncompressed mode and in compressed mode at high-quality factors. Despite that, the field is lacking a robust method to detect median filtering in low-resolution images compressed with low-quality factors. In this article, a novel feature set (four feature dimensions), based on first-order statistics of frequency contents of median filtered residuals (MFRs) of original and median filtered images, has been proposed. 
The proposed feature set outperforms handcrafted features-based state-of-the-art detectors in terms of feature set dimensions and detection results obtained for low-resolution images at all quality factors. Also, results reveal the efficacy of proposed method over deep-learning-based median filtering detector. Comprehensive results expose the efficacy of the proposed detector to detect median filtering against other similar manipulations. Additionally, generalization ability test on cross-database images support the cross-validation results on four different databases. Thus, our proposed detector meets the current challenges in the field, to a great extent.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2019:HVO, author = "Kan Wu and Guanbin Li and Haofeng Li and Jianjun Zhang and Yizhou Yu", title = "Harvesting Visual Objects from {Internet} Images via Deep-Learning-Based Objectness Assessment", journal = j-TOMM, volume = "15", number = "3", pages = "72:1--72:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3318463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3318463", abstract = "The collection of internet images has been growing in an astonishing speed. It is undoubted that these images contain rich visual information that can be useful in many applications, such as visual media creation and data-driven image synthesis. In this article, we focus on the methodologies for building a visual object database from a collection of internet images. 
Such database is built to contain a large number of high-quality visual objects that can help with various data-driven image applications. Our method is based on dense proposal generation and objectness-based re-ranking. A novel deep convolutional neural network is designed for the inference of proposal objectness, the probability of a proposal containing optimally located foreground object. In our work, the objectness is quantitatively measured in regard of completeness and fullness, reflecting two complementary features of an optimal proposal: a complete foreground and relatively small background. Our experiments indicate that object proposals re-ranked according to the output of our network generally achieve higher performance than those produced by other state-of-the-art methods. As a concrete example, a database of over 1.2 million visual objects has been built using the proposed method, and has been successfully used in various data-driven image applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yuan:2019:SSP, author = "Yuan Yuan and Jie Fang and Xiaoqiang Lu and Yachuang Feng", title = "Spatial Structure Preserving Feature Pyramid Network for Semantic Image Segmentation", journal = j-TOMM, volume = "15", number = "3", pages = "73:1--73:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321512", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321512", abstract = "Recently, progress on semantic image segmentation is substantial, benefiting from the rapid development of Convolutional Neural Networks. 
Semantic image segmentation approaches proposed lately have been mostly based on Fully convolutional Networks (FCNs). However, these FCN-based methods use large receptive fields and too many pooling layers to depict the discriminative semantic information of the images. Specifically, on one hand, convolutional kernel with large receptive field smooth the detailed edges, since too much contexture information is used to depict the ``center pixel.'' However, the pooling layer increases the receptive field through zooming out the latest feature maps, which loses many detailed information of the image, especially in the deeper layers of the network. These operations often cause low spatial resolution inside deep layers, which leads to spatially fragmented prediction. To address this problem, we exploit the inherent multi-scale and pyramidal hierarchy of deep convolutional networks to extract the feature maps with different resolutions and take full advantages of these feature maps via a gradually stacked fusing way. Specifically, for two adjacent convolutional layers, we upsample the features from deeper layer with stride of 2 and then stack them on the features from shallower layer. Then, a convolutional layer with kernels of 1$ \times $ 1 is followed to fuse these stacked features. The fused feature preserves the spatial structure information of the image; meanwhile, it owns strong discriminative capability for pixel classification. Additionally, to further preserve the spatial structure information and regional connectivity of the predicted category label map, we propose a novel loss term for the network. In detail, two graph model-based spatial affinity matrixes are proposed, which are used to depict the pixel-level relationships in the input image and predicted category label map respectively, and then their cosine distance is backward propagated to the network. 
The proposed architecture, called spatial structure preserving feature pyramid network, significantly improves the spatial resolution of the predicted category label map for semantic image segmentation. The proposed method achieves state-of-the-art results on three public and challenging datasets for semantic image segmentation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhang:2019:MFA, author = "Junxuan Zhang and Haifeng Hu and Xinlong Lu", title = "Moving Foreground-Aware Visual Attention and Key Volume Mining for Human Action Recognition", journal = j-TOMM, volume = "15", number = "3", pages = "74:1--74:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321511", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321511", abstract = "Recently, many deep learning approaches have shown remarkable progress on human action recognition. However, it remains unclear how to extract the useful information in videos since only video-level labels are available in the training phase. To address this limitation, many efforts have been made to improve the performance of action recognition by applying the visual attention mechanism in the deep learning model. In this article, we propose a novel deep model called Moving Foreground Attention (MFA) that enhances the performance of action recognition by guiding the model to focus on the discriminative foreground targets. In our work, MFA detects the moving foreground through a proposed variance-based algorithm. 
Meanwhile, an unsupervised proposal is utilized to mine the action-related key volumes and generate corresponding correlation scores. Based on these scores, a newly proposed stochastic-out scheme is exploited to train the MFA. Experiment results show that action recognition performance can be significantly improved by using our proposed techniques, and our model achieves state-of-the-art performance on UCF101 and HMDB51.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{More:2019:PLA, author = "Amit More and Subhasis Chaudhuri", title = "A Pseudo-likelihood Approach for Geo-localization of Events from Crowd-sourced Sensor-Metadata", journal = j-TOMM, volume = "15", number = "3", pages = "75:1--75:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321701", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321701", abstract = "Events such as live concerts, protest marches, and exhibitions are often video recorded by many people at the same time, typically using smartphone devices. In this work, we address the problem of geo-localizing such events from crowd-generated data. Traditional approaches for solving such a problem using multiple video sequences of the event would require highly complex computer vision (CV) methods, which are computation intensive and are not robust under the environment where visual data are collected through crowd-sourced medium. In the present work, we approach the problem in a probabilistic framework using only the sensor metadata obtained from smartphones. 
We model the event location and camera locations and orientations (camera parameters) as the hidden states in a Hidden Markov Model. The sensor metadata from GPS and the digital compass from user smartphones are used as the observations associated with the hidden states of the model. We have used a suitable potential function to capture the complex interaction between the hidden states (i.e., event location and camera parameters). The non-Gaussian densities involved in the model, such as the potential function involving hidden states, make the maximum-likelihood estimation intractable. We propose a pseudo-likelihood-based approach to maximize the approximate-likelihood, which provides a tractable solution to the problem. The experimental results on the simulated as well as real data show correct event geo-localization using the proposed method. When compared with several baselines the proposed method shows a superior performance. The overall computation time required is much smaller, since only the sensor metadata are used instead of visual data.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shah:2019:PCB, author = "Mohsin Shah and Weiming Zhang and Honggang Hu and Nenghai Yu", title = "{Paillier} Cryptosystem based Mean Value Computation for Encrypted Domain Image Processing Operations", journal = j-TOMM, volume = "15", number = "3", pages = "76:1--76:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3325194", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3325194", abstract = "Due to its large storage facility and high-end computing capability, cloud computing has received great attention as a huge amount of personal multimedia data and computationally expensive tasks can be outsourced to the cloud. However, the cloud being third-party semi-trusted, is prone to information leakage, raising privacy risks. Signal processing in the encrypted domain has emerged as a new research paradigm on privacy-preserving processing over outsourced data by semi-trusted cloud. In this article, we propose a solution for non-integer mean value computation in the homomorphic encrypted domain without any interactive protocol between the client and the service provider. Using the proposed solution, various image processing operations, such as local smoothing filter, un-sharp masking, and histogram equalization, can be performed in the encrypted domain at the cloud server without any privacy concerns. 
Our experimental results from standard test images reveal that these image processing operations can be performed without pre-processing, without client-server interactive protocol, and without any error between the encrypted domain and the plain domain.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Yue:2019:SRS, author = "Guanghui Yue and Chunping Hou and Tianwei Zhou", title = "Subtitle Region Selection of {S$3$D} Images in Consideration of Visual Discomfort and Viewing Habit", journal = j-TOMM, volume = "15", number = "3", pages = "77:1--77:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3325197", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3325197", abstract = "Subtitles, serving as a linguistic approximation of the visual content, are an essential element in stereoscopic advertisement and the film industry. Due to the vergence accommodation conflict, the stereoscopic 3D (S3D) subtitle inevitably causes visual discomfort. To meet the viewing experience, the subtitle region should be carefully arranged. Unfortunately, very few works have been dedicated to this area. In this article, we propose a method for S3D subtitle region selection in consideration of visual discomfort and viewing habit. First, we divide the disparity map into multiple depth layers according to the disparity value. The preferential processed depth layer is determined by considering the disparity value of the foremost object. 
Second, the optimal region and coarse disparity value for S3D subtitle insertion are chosen by convolving the selective depth layer with the mean filter. Specifically, the viewing habit is considered during the region selection. Finally, after region selection, the disparity value of the subtitle is further modified by using the just noticeable depth difference (JNDD) model. Given that there is no public database reported for the evaluation of S3D subtitle insertion, we collect 120 S3D images as the test platform. Both objective and subjective experiments are conducted to evaluate the comfort degree of the inserted subtitle. Experimental results demonstrate that the proposed method can obtain promising performance in improving the viewing experience of the inserted subtitle.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2019:LCB, author = "Yehao Li and Yingwei Pan and Ting Yao and Hongyang Chao and Yong Rui and Tao Mei", title = "Learning Click-Based Deep Structure-Preserving Embeddings with Visual Attention", journal = j-TOMM, volume = "15", number = "3", pages = "78:1--78:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328994", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3328994", abstract = "One fundamental problem in image search is to learn the ranking functions (i.e., the similarity between query and image). Recent progress on this topic has evolved through two paradigms: the text-based model and image ranker learning. 
The former relies on image surrounding texts, making the similarity sensitive to the quality of textual descriptions. The latter may suffer from the robustness problem when human-labeled query-image pairs cannot represent user search intent precisely. We demonstrate in this article that the preceding two limitations can be well mitigated by learning a cross-view embedding that leverages click data. Specifically, a novel click-based Deep Structure-Preserving Embeddings with visual Attention (DSPEA) model is presented, which consists of two components: deep convolutional neural networks followed by image embedding layers for learning visual embedding, and a deep neural networks for generating query semantic embedding. Meanwhile, visual attention is incorporated at the top of the convolutional neural network to reflect the relevant regions of the image to the query. Furthermore, considering the high dimension of the query space, a new click-based representation on a query set is proposed for alleviating this sparsity problem. The whole network is end-to-end trained by optimizing a large margin objective that combines cross-view ranking constraints with in-view neighborhood structure preservation constraints. On a large-scale click-based image dataset with 11.7 million queries and 1 million images, our model is shown to be powerful for keyword-based image search with superior performance over several state-of-the-art methods and achieves, to date, the best reported NDCG@25 of 52.21\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Cao:2019:SOG, author = "Tengfei Cao and Changqiao Xu and Mu Wang and Zhongbai Jiang and Xingyan Chen and Lujie Zhong and Luigi Alfredo Grieco", title = "Stochastic Optimization for Green Multimedia Services in Dense {$5$G} Networks", journal = j-TOMM, volume = "15", number = "3", pages = "79:1--79:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328996", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3328996", abstract = "The manyfold capacity magnification promised by dense 5G networks will make possible the provisioning of broadband multimedia services, including virtual reality, augmented reality, and mobile immersive video, to name a few. These new applications will coexist with classic ones and contribute to the exponential growth of multimedia services in mobile networks. At the same time, the different requirements of past and old services pose new challenges to the effective usage of 5G resources. In response to these challenges, a novel Stochastic Optimization framework for Green Multimedia Services named SOGMS is proposed herein that targets the maximization of system throughput and the minimization of energy consumption in data delivery. In particular, Lyapunov optimization is leveraged to face this optimization objective, which is formulated and decomposed into three tractable subproblems. For each subproblem, a distinct algorithm is conceived, namely quality of experience--based admission control, cooperative resource allocation, and multimedia services scheduling. 
Finally, extensive simulations are carried out to evaluate the proposed method against state-of-the-art solutions in dense 5G networks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wu:2019:PAT, author = "Jie Wu and Haifeng Hu and Liang Yang", title = "Pseudo-{$3$D} Attention Transfer Network with Content-aware Strategy for Image Captioning", journal = j-TOMM, volume = "15", number = "3", pages = "80:1--80:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3336495", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3336495", abstract = "In this article, we propose a novel Pseudo-3D Attention Transfer network with Content-aware Strategy (P3DAT-CAS) for the image captioning task. Our model is composed of three parts: the Pseudo-3D Attention (P3DA) network, the P3DA-based Transfer (P3DAT) network, and the Content-aware Strategy (CAS). First, we propose P3DA to take full advantage of three-dimensional (3D) information in convolutional feature maps and capture more details. Most existing attention-based models only extract the 2D spatial representation from convolutional feature maps to decide which area should be paid more attention to. However, convolutional feature maps are 3D and different channel features can detect diverse semantic attributes associated with images. P3DA is proposed to combine 2D spatial maps with 1D semantic-channel attributes and generate more informative captions. Second, we design the transfer network to maintain and transfer the key previous attention information. 
The traditional attention-based approaches only utilize the current attention information to predict words directly, whereas transfer network is able to learn long-term attention dependencies and explore global modeling pattern. Finally, we present CAS to provide a more relevant and distinct caption for each image. The captioning model trained by maximum likelihood estimation may generate the captions that have a weak correlation with image contents, resulting in the cross-modal gap between vision and linguistics. However, CAS is helpful to convey the meaningful visual contents accurately. P3DAT-CAS is evaluated on Flickr30k and MSCOCO, and it achieves very competitive performance among the state-of-the-art models.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2019:DSS, author = "Min Wang and Wengang Zhou and Qi Tian and Houqiang Li", title = "Deep Scalable Supervised Quantization by Self-Organizing Map", journal = j-TOMM, volume = "15", number = "3", pages = "81:1--81:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328995", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3328995", abstract = "Approximate Nearest Neighbor (ANN) search is an important research topic in multimedia and computer vision fields. In this article, we propose a new deep supervised quantization method by Self-Organizing Map to address this problem. Our method integrates the Convolutional Neural Networks and Self-Organizing Map into a unified deep architecture. The overall training objective optimizes supervised quantization loss as well as classification loss. 
With the supervised quantization objective, we minimize the differences on the maps between similar image pairs and maximize the differences on the maps between dissimilar image pairs. By optimization, the deep architecture can simultaneously extract deep features and quantize the features into suitable nodes in self-organizing map. To make the proposed deep supervised quantization method scalable for large datasets, instead of constructing a larger self-organizing map, we propose to divide the input space into several subspaces and construct self-organizing map in each subspace. The self-organizing maps in all the subspaces implicitly construct a large self-organizing map, which costs less memory and training time than directly constructing a self-organizing map with equal size. The experiments on several public standard datasets prove the superiority of our approaches over the existing ANN search methods. Besides, as a by-product, our deep architecture can be directly applied to visualization with little modification, and promising performance is demonstrated in the experiments.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ozcelik:2019:CDA, author = "Ihsan Mert Ozcelik and Cem Ersoy", title = "Chunk Duration-Aware {SDN}-Assisted {DASH}", journal = j-TOMM, volume = "15", number = "3", pages = "82:1--82:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3337681", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3337681", abstract = "Although Dynamic Adaptive Streaming over HTTP (DASH) is the pillar of multimedia content delivery mechanisms, its purely client-based adaptive video bitrate mechanisms have quality-of-experience fairness and stability problems in the existence of multiple DASH clients and highly fluctuating background traffic on the same shared bottleneck link. Varying chunk duration among different titles of multiple video providers exacerbates this problem. With the help of the global network view provided by the software-defined networking paradigm, we propose a centralized joint optimization module-assisted adaptive video bitrate mechanism that takes diversity of chunk sizes among different content into account. Our system collects possible video bitrate levels and chunk duration from DASH clients and simply calculates the optimal video bitrates per client based on the available capacity and chunk duration of each client's selected content while not invading users' privacy. By continuously following the background traffic flows, it asynchronously updates the target video bitrate levels to avoid both buffer stall events and network underutilization issues rather than bandwidth slicing, which brings about scalability problems in practice. 
It also guarantees fair startup delays for video sessions with various chunk duration. Our experiments clearly show that our proposed approach considering diversity of chunk duration and that background traffic fluctuations can significantly provide a better and fair quality of experience in terms of structural similarity--based video quality and startup delay compared to both purely client-based and state-of-the-art software-defined networking--based adaptive bitrate mechanisms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhuang:2019:RCI, author = "Naifan Zhuang and Guo-Jun Qi and The Duc Kieu and Kien A. Hua", title = "Rethinking the Combined and Individual Orders of Derivative of States for Differential Recurrent Neural Networks: Deep Differential Recurrent Neural Networks", journal = j-TOMM, volume = "15", number = "3", pages = "83:1--83:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3337928", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3337928", abstract = "Due to their special gating schemes, Long Short-Term Memory (LSTM) has shown greater potential to process complex sequential information than the traditional Recurrent Neural Network (RNN). The conventional LSTM, however, fails to take into consideration the impact of salient spatio-temporal dynamics present in the sequential input data. This problem was first addressed by the differential Recurrent Neural Network (dRNN), which uses a differential gating scheme known as Derivative of States (DoS). 
DoS uses higher orders of internal state derivatives to analyze the change in information gain originated from the salient motions between the successive frames. The weighted combination of several orders of DoS is then used to modulate the gates in dRNN. While each individual order of DoS is good at modeling a certain level of salient spatio-temporal sequences, the sum of all the orders of DoS could distort the detected motion patterns. To address this problem, we propose to control the LSTM gates via individual orders of DoS. To fully utilize the different orders of DoS, we further propose to stack multiple levels of LSTM cells in an increasing order of state derivatives. The proposed model progressively builds up the ability of the LSTM gates to detect salient dynamical patterns in deeper stacked layers modeling higher orders of DoS; thus, the proposed LSTM model is termed deep differential Recurrent Neural Network (d$^2$ RNN). The effectiveness of the proposed model is demonstrated on three publicly available human activity datasets: NUS-HGA, Violent-Flows, and UCF101. The proposed model outperforms both LSTM and non-LSTM based state-of-the-art algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Wang:2019:EBD, author = "Zhangcheng Wang and Ya Li and Richang Hong and Xinmei Tian", title = "Eigenvector-Based Distance Metric Learning for Image Classification and Retrieval", journal = j-TOMM, volume = "15", number = "3", pages = "84:1--84:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3340262", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Oct 2 10:12:47 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3340262", abstract = "Distance metric learning has been widely studied in multifarious research fields. The mainstream approaches learn a Mahalanobis metric or learn a linear transformation. Recent related works propose learning a linear combination of base vectors to approximate the metric. In this way, fewer variables need to be determined, which is efficient when facing high-dimensional data. Nevertheless, such works obtain base vectors using additional data from related domains or randomly generate base vectors. However, obtaining base vectors from related domains requires extra time and additional data, and random vectors introduce randomness into the learning process, which requires sufficient random vectors to ensure the stability of the algorithm. Moreover, the random vectors cannot capture the rich information of the training data, leading to a degradation in performance. Considering these drawbacks, we propose a novel distance metric learning approach by introducing base vectors explicitly learned from training data. 
Given a specific task, we can make a sparse approximation of its objective function using the top eigenvalues and corresponding eigenvectors of a predefined integral operator on the reproducing kernel Hilbert space. Because the process of generating eigenvectors simply refers to the training data of the considered task, our proposed method does not require additional data and can reflect the intrinsic information of the input features. Furthermore, the explicitly learned eigenvectors do not result in randomness, and we can extend our method to any kernel space without changing the objective function. We only need to learn the coefficients of these eigenvectors, and the only hyperparameter that we need to determine is the number of eigenvectors that we utilize. Additionally, an optimization algorithm is proposed to efficiently solve this problem. Extensive experiments conducted on several datasets demonstrate the effectiveness of our proposed method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Pala:2020:ISI, author = "Pietro Pala and Liming Chen and Di Huang and Xiaoming Liu and Stefanos Zafeiriou", title = "Introduction to the Special Issue on Face Analysis Applications", journal = j-TOMM, volume = "15", number = "3s", pages = "1--2", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3359624", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3359624", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2020:UTB, author = "Zhen-Hua Feng and Josef Kittler and Bill Christmas and Xiao-Jun Wu", title = "A Unified Tensor-based Active Appearance Model", journal = j-TOMM, volume = "15", number = "3s", pages = "1--22", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3338841", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3338841", abstract = "Appearance variations result in many difficulties in face image analysis. To deal with this challenge, we present a Unified Tensor-based Active Appearance Model (UT-AAM) for jointly modelling the geometry and texture information of 2D faces. For each \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shamai:2020:SFP, author = "Gil Shamai and Ron Slossberg and Ron Kimmel", title = "Synthesizing Facial Photometries and Corresponding Geometries Using Generative Adversarial Networks", journal = j-TOMM, volume = "15", number = "3s", pages = "1--24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3337067", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3337067", abstract = "Artificial data synthesis is currently a well-studied topic with useful applications in data science, computer vision, graphics, and many other fields. 
Generating realistic data is especially challenging, since human perception is highly sensitive to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:UNC, author = "Xueping Wang and Yunhong Wang and Weixin Li", title = "{U-Net} Conditional {GANs} for Photo-Realistic and Identity-Preserving Facial Expression Synthesis", journal = j-TOMM, volume = "15", number = "3s", pages = "1--23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355397", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3355397", abstract = "Facial expression synthesis (FES) is a challenging task since the expression changes are highly non-linear and depend on the facial appearance. Person identity should also be well preserved in the synthesized face. In this article, we present a novel U- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2020:EFA, author = "Zhiwei Liu and Xiangyu Zhu and Ming Tang and Zhen Lei and Jinqiao Wang", title = "Efficient Face Alignment with Fast Normalization and Contour Fitting Loss", journal = j-TOMM, volume = "15", number = "3s", pages = "1--16", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3338842", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3338842", abstract = "Face alignment is a key component of numerous face analysis tasks. In recent years, most existing methods have focused on designing high-performance face alignment systems and paid less attention to efficiency. However more face alignment systems are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duan:2020:VAA, author = "Huiyu Duan and Xiongkuo Min and Yi Fang and Lei Fan and Xiaokang Yang and Guangtao Zhai", title = "Visual Attention Analysis and Prediction on Human Faces for Children with Autism Spectrum Disorder", journal = j-TOMM, volume = "15", number = "3s", pages = "1--23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3337066", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3337066", abstract = "The focus of this article is to analyze and predict the visual attention of children with Autism Spectrum Disorder (ASD) when looking at human faces. Social difficulties are the hallmark features of ASD and will lead to atypical visual attention toward \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duan:2020:FEM, author = "Mingxing Duan and Kenli Li and Xiangke Liao and Keqin Li and Qi Tian", title = "Features-Enhanced Multi-Attribute Estimation with Convolutional Tensor Correlation Fusion Network", journal = j-TOMM, volume = "15", number = "3s", pages = "1--23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355542", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3355542", abstract = "To achieve robust facial attribute estimation, a hierarchical prediction system referred to as tensor correlation fusion network (TCFN) is proposed for attribute estimation. The system includes feature extraction, correlation excavation among facial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2020:ISI, author = "Sicheng Zhao and Dhiraj Joshi and Mohammad Soleymani and Qiang Ji", title = "Introduction to the Special Issue on Affective Computing for Large-scale Heterogeneous Multimedia Data", journal = j-TOMM, volume = "15", number = "3s", pages = "1--2", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3365845", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3365845", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2020:ACL, author = "Sicheng Zhao and Shangfei Wang and Mohammad Soleymani and Dhiraj Joshi and Qiang Ji", title = "Affective Computing for Large-scale Heterogeneous Multimedia Data: a Survey", journal = j-TOMM, volume = "15", number = "3s", pages = "1--32", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3363560", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3363560", abstract = "The wide popularity of digital photography and social networks has generated a rapidly growing volume of multimedia data (i.e., images, music, and videos), resulting in a great demand for managing, retrieving, and understanding these data. Affective \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hong:2020:CSF, author = "Xiaopeng Hong and Wei Peng and Mehrtash Harandi and Ziheng Zhou and Matti Pietik{\"a}inen and Guoying Zhao", title = "Characterizing Subtle Facial Movements via {Riemannian} Manifold", journal = j-TOMM, volume = "15", number = "3s", pages = "1--24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3342227", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3342227", abstract = "Characterizing subtle facial movements from videos is one of the most intensive topics in computer vision research. 
It is, however, challenging, since (1) the intensity of subtle facial muscle movement is usually low, (2) the duration may be transient, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2020:PSB, author = "Junjie Zhu and Yuxuan Wei and Yifan Feng and Xibin Zhao and Yue Gao", title = "Physiological Signals-based Emotion Recognition via High-order Correlation Learning", journal = j-TOMM, volume = "15", number = "3s", pages = "1--18", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3332374", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3332374", abstract = "Emotion recognition by physiological signals is an effective way to discern the inner state of human beings and therefore has been widely adopted in many user-centered applications. The majority of current state-of-the-art methods focus on exploring \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{She:2020:LDS, author = "Dongyu She and Ming Sun and Jufeng Yang", title = "Learning Discriminative Sentiment Representation from Strongly- and Weakly Supervised {CNNs}", journal = j-TOMM, volume = "15", number = "3s", pages = "1--19", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3326335", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3326335", abstract = "Visual sentiment analysis is attracting increasing attention with the rapidly growing amount of images uploaded to social networks. Learning rich visual representations often requires training deep convolutional neural networks (CNNs) on massive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2020:HCR, author = "Liang Li and Xinge Zhu and Yiming Hao and Shuhui Wang and Xingyu Gao and Qingming Huang", title = "A Hierarchical {CNN-RNN} Approach for Visual Emotion Classification", journal = j-TOMM, volume = "15", number = "3s", pages = "1--17", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3359753", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3359753", abstract = "Visual emotion classification is predicting emotional reactions of people for the given visual content. 
Psychological studies show that human emotions are affected by various visual stimuli from low level to high level, including contrast, color, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2020:ASC, author = "Liang Yang and Yuexue Wang and Junhua Gu and Xiaochun Cao and Xiao Wang and Di Jin and Guiguang Ding and Jungong Han and Weixiong Zhang", title = "Autonomous Semantic Community Detection via Adaptively Weighted Low-rank Approximation", journal = j-TOMM, volume = "15", number = "3s", pages = "1--22", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355393", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3355393", abstract = "Identification of semantic community structures is important for understanding the interactions and sentiments of different groups of people and predicting the social emotion. A robust community detection method needs to autonomously determine the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hou:2020:SDE, author = "Yuxin Hou and Hongxun Yao and Xiaoshuai Sun and Haoran Li", title = "{Soul Dancer}: Emotion-Based Human Action Generation", journal = j-TOMM, volume = "15", number = "3s", pages = "1--19", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3340463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3340463", abstract = "Body language is one of the most common ways of expressing human emotion. In this article, we make the first attempt to generate an action video with a specific emotion from a single person image. The goal of the emotion-based action generation task \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2020:ACA, author = "Shenghong Hu and Min Xu and Haimin Zhang and Chunxia Xiao and Chao Gui", title = "Affective Content-aware Adaptation Scheme on {QoE} Optimization of Adaptive Streaming over {HTTP}", journal = j-TOMM, volume = "15", number = "3s", pages = "1--18", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3328997", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jan 23 07:04:18 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3328997", abstract = "The article presents a novel affective content-aware adaptation scheme (ACAA) to optimize Quality of Experience (QoE) for dynamic adaptive video streaming over HTTP (DASH). 
Most of the existing DASH adaptation schemes conduct video bit-rate adaptation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nie:2020:HHG, author = "Weizhi Nie and Weijie Wang and Anan Liu and Yuting Su and Jie Nie", title = "{HGAN}: Holistic Generative Adversarial Networks for Two-dimensional Image-based Three-dimensional Object Retrieval", journal = j-TOMM, volume = "15", number = "4", pages = "1--24", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3344684", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3344684", abstract = "In this article, we propose a novel method to address the two-dimensional (2D) image-based 3D object retrieval problem. First, we extract a set of virtual views to represent each 3D object. Then, a soft-attention model is utilized to find the weight of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Li:2020:IVR, author = "Mading Li and Jiaying Liu and Xiaoyan Sun and Zhiwei Xiong", title = "Image\slash Video Restoration via Multiplanar Autoregressive Model and Low-Rank Optimization", journal = j-TOMM, volume = "15", number = "4", pages = "1--23", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3341728", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3341728", abstract = "In this article, we introduce an image/video restoration approach by utilizing the high-dimensional similarity in images/videos. After grouping similar patches from neighboring frames, we propose to build a multiplanar autoregressive (AR) model to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zhong:2020:SDM, author = "Sheng-Hua Zhong and Yuantian Wang and Tongwei Ren and Mingjie Zheng and Yan Liu and Gangshan Wu", title = "Steganographer Detection via Multi-Scale Embedding Probability Estimation", journal = j-TOMM, volume = "15", number = "4", pages = "1--23", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3352691", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3352691", abstract = "Steganographer detection aims to identify the guilty user who utilizes steganographic methods to hide secret information in the spread of multimedia data, especially image data, from a large amount of innocent users on social networks. A true embedding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{AlvesdeAlmeida:2020:RPS, author = "Marcos {Alves de Almeida} and Carolina {Coimbra Vieira} and Pedro Olmo Stancioli {Vaz De Melo} and Renato {Martins Assun{\c{c}}{\~a}o}", title = "Random Playlists Smoothly Commuting Between Styles", journal = j-TOMM, volume = "15", number = "4", pages = "1--20", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3361742", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3361742", abstract = "Someone enjoys listening to playlists while commuting. He wants a different playlist of n songs each day, but always starting from Locked Out of Heaven, a Bruno Mars song. The list should progress in smooth transitions between successive and randomly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Ye:2020:SCM, author = "Zhaoda Ye and Yuxin Peng", title = "Sequential Cross-Modal Hashing Learning via Multi-scale Correlation Mining", journal = j-TOMM, volume = "15", number = "4", pages = "1--20", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3356338", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3356338", abstract = "Cross-modal hashing aims to map heterogeneous multimedia data into a common Hamming space through hash function, and achieves fast and flexible cross-modal retrieval. Most existing cross-modal hashing methods learn hash function by mining the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "105", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2020:EIH, author = "Shiguang Liu and Ziqing Huang", title = "Efficient Image Hashing with Geometric Invariant Vector Distance for Copy Detection", journal = j-TOMM, volume = "15", number = "4", pages = "1--22", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3355394", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3355394", abstract = "Hashing method is an efficient technique of multimedia security for content protection. It maps an image into a content-based compact code for denoting the image itself. 
While most existing algorithms focus on improving the classification between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2020:LAB, author = "Zhandong Liu and Wengang Zhou and Houqiang Li", title = "{AB-LSTM}: Attention-based Bidirectional {LSTM} Model for Scene Text Detection", journal = j-TOMM, volume = "15", number = "4", pages = "1--23", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3356728", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3356728", abstract = "Detection of scene text in arbitrary shapes is a challenging task in the field of computer vision. Most existing scene text detection methods exploit the rectangle/quadrangular bounding box to denote the detected text, which fails to accurately fit text \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Bhowmik:2020:EDA, author = "Deepayan Bhowmik and Charith Abhayaratne", title = "Embedding Distortion Analysis in Wavelet-domain Watermarking", journal = j-TOMM, volume = "15", number = "4", pages = "1--24", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3357333", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357333", abstract = "Imperceptibility and robustness are two complementary fundamental requirements of any watermarking algorithm. Low-strength watermarking yields high imperceptibility, but exhibits poor robustness. High-strength watermarking schemes achieve good \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shen:2020:VRS, author = "Ling Shen and Richang Hong and Haoran Zhang and Xinmei Tian and Meng Wang", title = "Video Retrieval with Similarity-Preserving Deep Temporal Hashing", journal = j-TOMM, volume = "15", number = "4", pages = "1--16", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3356316", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3356316", abstract = "Despite the fact that remarkable progress has been made in recent years, Content-based Video Retrieval (CBVR) is still an appealing research topic due to increasing search demands in the Internet era of big data. This article aims to explore an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{VanderHooft:2020:TBA, author = "Jeroen {Van der Hooft} and Maria {Torres Vega} and Stefano Petrangeli and Tim Wauters and Filip {De Turck}", title = "Tile-based Adaptive Streaming for Virtual Reality Video", journal = j-TOMM, volume = "15", number = "4", pages = "1--24", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3362101", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3362101", abstract = "The increasing popularity of head-mounted devices and 360${}^\circ $ video cameras allows content providers to provide virtual reality (VR) video streaming over the Internet, using a two-dimensional representation of the immersive content combined with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Filho:2020:DPV, author = "Roberto Iraja {Tavares Da Costa Filho} and Marcelo {Caggiani Luizelli} and Stefano Petrangeli and Maria {Torres Vega} and Jeroen {Van der Hooft} and Tim Wauters and Filip {De Turck} and Luciano {Paschoal Gaspary}", title = "Dissecting the Performance of {VR} Video Streaming through the {VR-EXP} Experimentation Platform", journal = j-TOMM, volume = "15", number = "4", pages = "1--23", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3360286", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3360286", abstract = "To cope with the massive bandwidth demands of Virtual Reality (VR) video streaming, both the scientific community and the industry have been proposing optimization techniques such as viewport-aware streaming and tile-based adaptive bitrate heuristics. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Zheng:2020:ULH, author = "Yunpeng Zheng and Xuelong Li and Xiaoqiang Lu", title = "Unsupervised Learning of Human Action Categories in Still Images with Deep Representations", journal = j-TOMM, volume = "15", number = "4", pages = "1--20", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3362161", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3362161", abstract = "In this article, we propose a novel method for unsupervised learning of human action categories in still images. In contrast to previous methods, the proposed method explores distinctive information of actions directly from unlabeled image databases, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Xing:2020:ICC, author = "Meng Xing and Zhiyong Feng and Yong Su and Jianhai Zhang", title = "An Image Cues Coding Approach for {$3$D} Human Pose Estimation", journal = j-TOMM, volume = "15", number = "4", pages = "1--20", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3368066", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3368066", abstract = "Although Deep Convolutional Neural Networks (DCNNs) facilitate the evolution of 3D human pose estimation, ambiguity remains the most challenging problem in such tasks. 
Inspired by the Human Perception Mechanism (HPM), we propose an image-to-pose coding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Liu:2020:EEA, author = "Jinhuan Liu and Xuemeng Song and Liqiang Nie and Tian Gan and Jun Ma", title = "An End-to-End Attention-Based Neural Model for Complementary Clothing Matching", journal = j-TOMM, volume = "15", number = "4", pages = "1--16", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3368071", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3368071", abstract = "In modern society, people tend to prefer fashionable and decent outfits that can meet more than basic physiological needs. In fact, a proper outfit usually relies on good matching among complementary fashion items (e.g., the top, bottom, and shoes) that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Kua:2020:ACA, author = "Jonathan Kua and Grenville Armitage and Philip Branch and Jason But", title = "Adaptive Chunklets and {AQM} for Higher-Performance Content Streaming", journal = j-TOMM, volume = "15", number = "4", pages = "1--24", month = jan, year = "2020", DOI = "https://doi.org/10.1145/3344381", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 11 08:35:19 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3344381", abstract = "Commercial streaming services such as Netflix and YouTube use proprietary HTTP-based adaptive streaming (HAS) techniques to deliver content to consumers worldwide. MPEG recently developed Dynamic Adaptive Streaming over HTTP (DASH) as a unifying \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Chen:2020:LLF, author = "Bin Chen and Lingyan Ruan and Miu-Ling Lam", title = "{LFGAN}: {$4$D} Light Field Synthesis from a Single {RGB} Image", journal = j-TOMM, volume = "16", number = "1", pages = "2:1--2:20", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3366371", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3366371", abstract = "We present a deep neural network called the light field generative adversarial network (LFGAN) that synthesizes a 4D light field from a single 2D RGB image. 
We generate light fields using a single image super-resolution (SISR) technique based on two \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ding:2020:AEU, author = "Yuhang Ding and Hehe Fan and Mingliang Xu and Yi Yang", title = "Adaptive Exploration for Unsupervised Person Re-identification", journal = j-TOMM, volume = "16", number = "1", pages = "3:1--3:19", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369393", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369393", abstract = "Due to domain bias, directly deploying a deep person re-identification (re-ID) model trained on one dataset often achieves considerably poor accuracy on another dataset. In this article, we propose an Adaptive Exploration (AE) method to address the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bentaleb:2020:DDQ, author = "Abdelhak Bentaleb and Praveen Kumar Yadav and Wei Tsang Ooi and Roger Zimmermann", title = "{DQ-DASH}: a Queuing Theory Approach to Distributed Adaptive Video Streaming", journal = j-TOMM, volume = "16", number = "1", pages = "4:1--4:24", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371040", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371040", abstract = "The significant popularity of HTTP adaptive video streaming (HAS), such as Dynamic Adaptive Streaming over HTTP (DASH), over the Internet has led to a stark increase in user expectations in terms of video quality and delivery robustness. This situation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2020:RHR, author = "Xin Huang and Yuxin Peng and Zhang Wen", title = "{RCE-HIL}: Recognizing Cross-media Entailment with Heterogeneous Interactive Learning", journal = j-TOMM, volume = "16", number = "1", pages = "5:1--5:21", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3365003", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3365003", abstract = "Entailment recognition is an important paradigm of reasoning that judges if a hypothesis can be inferred from given premises. 
However, previous efforts mainly concentrate on text-based reasoning as recognizing textual entailment (RTE), where the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2020:CRT, author = "Miaopeng Li and Zimeng Zhou and Xinguo Liu", title = "Cross Refinement Techniques for Markerless Human Motion Capture", journal = j-TOMM, volume = "16", number = "1", pages = "6:1--6:18", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372207", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372207", abstract = "This article presents a global 3D human pose estimation method for markerless motion capture. Given two calibrated images of a person, it first obtains the 2D joint locations in the images using a pre-trained 2D Pose CNN, then constructs the 3D pose \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Illahi:2020:CGF, author = "Gazi Karam Illahi and Thomas {Van Gemert} and Matti Siekkinen and Enrico Masala and Antti Oulasvirta and Antti Yl{\"a}-J{\"a}{\"a}ski", title = "Cloud Gaming with Foveated Video Encoding", journal = j-TOMM, volume = "16", number = "1", pages = "7:1--7:24", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369110", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369110", abstract = "Cloud gaming enables playing high-end games, originally designed for PC or game console setups, on low-end devices such as netbooks and smartphones, by offloading graphics rendering to GPU-powered cloud servers. However, transmitting the high-resolution \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nguyen:2020:ETS, author = "Duc V. Nguyen and Huyen T. T. Tran and Truong Cong Thang", title = "An Evaluation of Tile Selection Methods for Viewport-Adaptive Streaming of 360-Degree Video", journal = j-TOMM, volume = "16", number = "1", pages = "8:1--8:24", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3373359", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3373359", abstract = "360-degree video has become increasingly popular nowadays. 
For effective transmission of bandwidth-intensive 360-degree video over networks, viewport-adaptive streaming has been introduced. In this article, we evaluate, for the first time, ten existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2020:LSS, author = "Zhenguo Yang and Zehang Lin and Peipei Kang and Jianming Lv and Qing Li and Wenyin Liu", title = "Learning Shared Semantic Space with Correlation Alignment for Cross-Modal Event Retrieval", journal = j-TOMM, volume = "16", number = "1", pages = "9:1--9:22", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3374754", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3374754", abstract = "In this article, we propose to learn shared semantic space with correlation alignment ( S$^3$ CA ) for multimodal data representations, which aligns nonlinear correlations of multimodal data distributions in deep neural networks designed for heterogeneous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:JSH, author = "Junfeng Zhang and Haifeng Hu and Guobin Shen", title = "Joint Stacked Hourglass Network and Salient Region Attention Refinement for Robust Face Alignment", journal = j-TOMM, volume = "16", number = "1", pages = "10:1--10:18", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3374760", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3374760", abstract = "Facial landmark detection aims to locate keypoints for facial images, which typically suffer from variations caused by arbitrary pose, diverse facial expressions, and partial occlusion. In this article, we propose a coarse-to-fine framework that joins a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tasaka:2020:CSM, author = "Shuji Tasaka", title = "Causal Structures of Multidimensional {QoE} in Haptic-Audiovisual Communications: {Bayesian} Modeling", journal = j-TOMM, volume = "16", number = "1", pages = "11:1--11:23", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3375922", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3375922", abstract = "This article proposes a methodology for building and verifying plausible models that can express causation in multidimensional QoE for haptic-audiovisual interactive communications. 
For the modeling, we utilize subjective experimental data of five-point \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Punn:2020:IUN, author = "Narinder Singh Punn and Sonali Agarwal", title = "Inception {U-Net} Architecture for Semantic Segmentation to Identify Nuclei in Microscopy Cell Images", journal = j-TOMM, volume = "16", number = "1", pages = "12:1--12:15", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3376922", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3376922", abstract = "With the increasing applications of deep learning in biomedical image analysis, in this article we introduce an inception U-Net architecture for automating nuclei detection in microscopy cell images of varying size and modality to help unlock faster \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chaudhary:2020:IRC, author = "Chandramani Chaudhary and Poonam Goyal and Navneet Goyal and Yi-Ping Phoebe Chen", title = "Image Retrieval for Complex Queries Using Knowledge Embedding", journal = j-TOMM, volume = "16", number = "1", pages = "13:1--13:23", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3375786", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3375786", abstract = "With the increase in popularity of image-based applications, users are retrieving images using more sophisticated and complex queries. We present three types of complex queries, namely, long, ambiguous, and abstract. Each type of query has its own \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Luo:2020:STS, author = "Guoliang Luo and Zhigang Deng and Xin Zhao and Xiaogang Jin and Wei Zeng and Wenqiang Xie and Hyewon Seo", title = "Spatio-temporal Segmentation Based Adaptive Compression of Dynamic Mesh Sequences", journal = j-TOMM, volume = "16", number = "1", pages = "14:1--14:24", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377475", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377475", abstract = "With the recent advances in data acquisition techniques, the compression of various dynamic mesh sequence data has become an important topic in the computer graphics community. In this article, we present a new spatio-temporal segmentation-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2020:FLB, author = "Zhaoqing Pan and Xiaokai Yi and Yun Zhang and Hui Yuan and Fu Lee Wang and Sam Kwong", title = "Frame-level Bit Allocation Optimization Based on Video Content Characteristics for {HEVC}", journal = j-TOMM, volume = "16", number = "1", pages = "15:1--15:20", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3380827", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3380827", abstract = "Rate control plays an important role in high efficiency video coding (HEVC), and bit allocation is the foundation of rate control. The video content characteristics are significant for bit allocation, and modeling an accurate relationship between video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ainam:2020:EAF, author = "Jean-Paul Ainam and Ke Qin and Guisong Liu and Guangchun Luo and Brighter Agyemang", title = "Enforcing Affinity Feature Learning through Self-attention for Person Re-identification", journal = j-TOMM, volume = "16", number = "1", pages = "16:1--16:22", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377352", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377352", abstract = "Person re-identification is the task of recognizing an individual across heterogeneous non-overlapping camera views. It has become a crucial capability needed by many applications in public space video surveillance. However, it remains a challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2020:DLA, author = "Mengyan Li and Zhaoyu Zhang and Guochen Xie and Jun Yu", title = "A Deep Learning Approach for Face Hallucination Guided by Facial Boundary Responses", journal = j-TOMM, volume = "16", number = "1", pages = "17:1--17:23", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377874", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377874", abstract = "Face hallucination is a domain-specific super-resolution (SR) problem of learning a mapping between a low-resolution (LR) face image and its corresponding high-resolution (HR) image. Tremendous progress on deep learning has shown exciting potential for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2020:EDL, author = "Zan Gao and Yinming Li and Shaohua Wan", title = "Exploring Deep Learning for View-Based {$3$D} Model Retrieval", journal = j-TOMM, volume = "16", number = "1", pages = "18:1--18:21", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377876", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 6 09:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377876", abstract = "In recent years, view-based 3D model retrieval has become one of the research focuses in the field of computer vision and machine learning. 
In fact, the 3D model retrieval algorithm consists of feature extraction and similarity measurement, and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:ISI, author = "Shengping Zhang and Huiyu Zhou and Dong Xu and M. Emre Celebi and Thierry Bouwmans", title = "Introduction to the Special Issue on Multimodal Machine Learning for Human Behavior Analysis", journal = j-TOMM, volume = "16", number = "1s", pages = "19:1--19:2", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3381917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3381917", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2020:RVT, author = "Changyong Guo and Zhaoxin Zhang and Jinjiang Li and Xuesong Jiang and Jun Zhang and Lei Zhang", title = "Robust Visual Tracking Using Kernel Sparse Coding on Multiple Covariance Descriptors", journal = j-TOMM, volume = "16", number = "1s", pages = "20:1--20:22", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3360308", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3360308", abstract = "In this article, we aim to improve the performance of visual tracking by combing different features of multiple modalities. 
The core idea is to use covariance matrices as feature descriptors and then use sparse coding to encode different features. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:CSO, author = "Zhaoxin Zhang and Changyong Guo and Fanzhi Meng and Taizhong Xu and Junkai Huang", title = "{CovLets}: a Second-Order Descriptor for Modeling Multiple Features", journal = j-TOMM, volume = "16", number = "1s", pages = "21:1--21:14", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357525", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357525", abstract = "State-of-the-art techniques for image and video classification take a bottom-up approach where local features are aggregated into a global final representation. Existing frameworks (i.e., bag of words or Fisher vectors) are specifically designed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Meng:2020:ARU, author = "Quanling Meng and Heyan Zhu and Weigang Zhang and Xuefeng Piao and Aijie Zhang", title = "Action Recognition Using Form and Motion Modalities", journal = j-TOMM, volume = "16", number = "1s", pages = "22:1--22:16", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3350840", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3350840", abstract = "Action recognition has attracted increasing interest in computer vision due to its potential applications in many vision systems. One of the main challenges in action recognition is to extract powerful features from videos. Most existing approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shamsolmoali:2020:AAM, author = "Pourya Shamsolmoali and Masoumeh Zareapoor and Huiyu Zhou and Jie Yang", title = "{AMIL}: Adversarial Multi-instance Learning for Human Pose Estimation", journal = j-TOMM, volume = "16", number = "1s", pages = "23:1--23:23", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355612", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3355612", abstract = "Human pose estimation has an important impact on a wide range of applications, from human-computer interface to surveillance and content-based video retrieval. 
For human pose estimation, joint obstructions and overlapping upon human bodies result in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhuang:2020:MAR, author = "Yueting Zhuang and Dejing Xu and Xin Yan and Wenzhuo Cheng and Zhou Zhao and Shiliang Pu and Jun Xiao", title = "Multichannel Attention Refinement for Video Question Answering", journal = j-TOMM, volume = "16", number = "1s", pages = "24:1--24:23", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3366710", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3366710", abstract = "Video Question Answering (VideoQA) is the extension of image question answering (ImageQA) in the video domain. Methods are required to give the correct answer after analyzing the provided video and question in this task. Comparing to ImageQA, the most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Grigorev:2020:DDD, author = "Aleksei Grigorev and Shaohui Liu and Zhihong Tian and Jianxin Xiong and Seungmin Rho and Jiang Feng", title = "Delving Deeper in Drone-Based Person Re-Id by Employing Deep Decision Forest and Attributes Fusion", journal = j-TOMM, volume = "16", number = "1s", pages = "25:1--25:15", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3360050", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3360050", abstract = "Deep learning has revolutionized the field of computer vision and image processing. Its ability to extract the compact image representation has taken the person re-identification (re-id) problem to a new level. However, in most cases, researchers are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2020:SPG, author = "Zhaoju Li and Zongwei Zhou and Nan Jiang and Zhenjun Han and Junliang Xing and Jianbin Jiao", title = "Spatial Preserved Graph Convolution Networks for Person Re-identification", journal = j-TOMM, volume = "16", number = "1s", pages = "26:1--26:14", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3362988", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3362988", abstract = "Person Re-identification is a very challenging task due to inter-class ambiguity caused by similar appearances, and large intra-class diversity caused by viewpoints, illuminations, and poses. To address these challenges, in this article, a graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2020:AAC, author = "Hui Chen and Guiguang Ding and Zijia Lin and Sicheng Zhao and Xiaopeng Gu and Wenyuan Xu and Jungong Han", title = "{ACMNet}: Adaptive Confidence Matching Network for Human Behavior Analysis via Cross-modal Retrieval", journal = j-TOMM, volume = "16", number = "1s", pages = "27:1--27:21", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3362065", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3362065", abstract = "Cross-modality human behavior analysis has attracted much attention from both academia and industry. In this article, we focus on the cross-modality image-text retrieval problem for human behavior analysis, which can learn a common latent space for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:MSS, author = "Anran Zhang and Xiaolong Jiang and Baochang Zhang and Xianbin Cao", title = "Multi-scale Supervised Attentive Encoder--Decoder Network for Crowd Counting", journal = j-TOMM, volume = "16", number = "1s", pages = "28:1--28:20", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3356019", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3356019", abstract = "Crowd counting is a popular topic with widespread applications. 
Currently, the biggest challenge to crowd counting is large-scale variation in objects. In this article, we focus on overcoming this challenge by proposing a novel Attentive Encoder-Decoder \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tanveer:2020:ISI, author = "M. Tanveer and P. Khanna and M. Prasad and C. T. Lin", title = "Introduction to the Special Issue on Computational Intelligence for Biomedical Data and Imaging", journal = j-TOMM, volume = "16", number = "1s", pages = "29:1--29:4", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3381919", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3381919", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tanveer:2020:MLT, author = "M. Tanveer and B. Richhariya and R. U. Khan and A. H. Rashid and P. Khanna and M. Prasad and C. T. 
Lin", title = "Machine Learning Techniques for the Diagnosis of {Alzheimer}'s Disease: a Review", journal = j-TOMM, volume = "16", number = "1s", pages = "30:1--30:35", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3344998", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3344998", abstract = "Alzheimer's disease is an incurable neurodegenerative disease primarily affecting the elderly population. Efficient automated techniques are needed for early diagnosis of Alzheimer's. Many novel approaches are proposed by researchers for classification \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yadav:2020:EDA, author = "Shweta Yadav and Pralay Ramteke and Asif Ekbal and Sriparna Saha and Pushpak Bhattacharyya", title = "Exploring Disorder-Aware Attention for Clinical Event Extraction", journal = j-TOMM, volume = "16", number = "1s", pages = "31:1--31:21", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372328", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372328", abstract = "Event extraction is one of the crucial tasks in biomedical text mining that aims to extract specific information concerning incidents embedded in the texts. In this article, we propose a deep learning framework that aims to identify the attributes \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tripathi:2020:CNC,
  author =       "Suvidha Tripathi and Satish Kumar Singh",
  title =        "Cell Nuclei Classification in Histopathological
                 Images Using {Hybrid OLConvNet}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "1s",
  pages =        "32:1--32:22",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345318",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Apr 30 10:35:21 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345318",
  abstract =     "Computer-aided histopathological image analysis for
                 cancer detection is a major research challenge in the
                 medical domain. Automatic detection and classification
                 of nuclei for cancer diagnosis impose a lot of
                 challenges in developing state-of-the-art \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2020:DSS,
  author =       "Nengjun Zhu and Jian Cao and Kunwei Shen and Xiaosong
                 Chen and Siji Zhu",
  title =        "A Decision Support System with Intelligent
                 Recommendation for Multi-disciplinary Medical
                 Treatment",
  journal =      j-TOMM,
  volume =       "16",
  number =       "1s",
  pages =        "33:1--33:23",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3352573",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Apr 30 10:35:21 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3352573",
  abstract =     "Recent years have witnessed an emerging trend for
                 improving disease treatment by forming
                 multi-disciplinary medical teams.
The collaboration among specialists from multiple medical domains has been shown to be significantly helpful for designing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:RFS, author = "Qingyong Wang and Yun Zhou and Weiping Ding and Zhiguo Zhang and Khan Muhammad and Zehong Cao", title = "Random Forest with Self-Paced Bootstrap Learning in Lung Cancer Prognosis", journal = j-TOMM, volume = "16", number = "1s", pages = "34:1--34:12", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3345314", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3345314", abstract = "Training gene expression data with supervised learning approaches can provide an alarm sign for early treatment of lung cancer to decrease death rates. However, the samples of gene features involve lots of noises in a realistic environment. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Saini:2020:TEB, author = "Naveen Saini and Sriparna Saha and Pushpak Bhattacharyya and Himanshu Tuteja", title = "Textual Entailment-Based Figure Summarization for Biomedical Articles", journal = j-TOMM, volume = "16", number = "1s", pages = "35:1--35:24", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357334", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357334", abstract = "This article proposes a novel unsupervised approach (FigSum++) for automatic figure summarization in biomedical scientific articles using a multi-objective evolutionary algorithm. The problem is treated as an optimization problem where relevant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tong:2020:PND, author = "Chao Tong and Baoyu Liang and Mengze Zhang and Rongshan Chen and Arun Kumar Sangaiah and Zhigao Zheng and Tao Wan and Chenyang Yue and Xinyi Yang", title = "Pulmonary Nodule Detection Based on {ISODATA}-Improved Faster {RCNN} and {$3$D-CNN} with Focal Loss", journal = j-TOMM, volume = "16", number = "1s", pages = "36:1--36:9", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3365445", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3365445", abstract = "The early diagnosis of pulmonary cancer can significantly improve the survival rate of patients, where pulmonary nodules detection in computed tomography images plays an important role. In this article, we propose a novel pulmonary nodule detection \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Agrawal:2020:HWB, author = "Utkarsh Agrawal and Jatin Arora and Rahul Singh and Deepak Gupta and Ashish Khanna and Aditya Khamparia", title = "Hybrid Wolf--Bat Algorithm for Optimization of Connection Weights in Multi-layer Perceptron", journal = j-TOMM, volume = "16", number = "1s", pages = "37:1--37:20", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3350532", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3350532", abstract = "In a neural network, the weights act as parameters to determine the output(s) from a set of inputs. The weights are used to find the activation values of nodes of a layer from the values of the previous layer. Finding the ideal set of these weights for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rout:2020:ICA, author = "Ranjeet Kumar Rout and Sk. 
Sarif Hassan and Sanchit Sindhwani and Hari Mohan Pandey and Saiyed Umer", title = "Intelligent Classification and Analysis of Essential Genes Using Quantitative Methods", journal = j-TOMM, volume = "16", number = "1s", pages = "38:1--38:21", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3343856", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3343856", abstract = "Essential genes are considered to be the genes required to sustain life of different organisms. These genes encode proteins that maintain central metabolism, DNA replications, translation of genes, and basic cellular structure, and mediate the transport \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:ABM, author = "Hongyi Zhang and Haoke Zhang and Sandeep Pirbhulal and Wanqing Wu and Victor Hugo C. 
{De Albuquerque}", title = "Active Balancing Mechanism for Imbalanced Medical Data in Deep Learning-Based Classification Models", journal = j-TOMM, volume = "16", number = "1s", pages = "39:1--39:15", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357253", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Apr 30 10:35:21 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357253", abstract = "Imbalanced data always has a serious impact on a predictive model, and most under-sampling techniques consume more time and suffer from loss of samples containing critical information during imbalanced data processing, especially in the biomedical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Vellingiri:2020:SCB, author = "Shanthi Vellingiri and Ryan P. McMahan and Balakrishnan Prabhakaran", title = "{SCeVE}: a Component-based Framework to Author Mixed Reality Tours", journal = j-TOMM, volume = "16", number = "2", pages = "40:1--40:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377353", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377353", abstract = "Authoring a collaborative, interactive Mixed Reality (MR) tour requires flexible design and development of various software modules for tasks such as managing geographically distributed participants, adaptable travel and virtual camera techniques, data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2020:BDC, author = "Jiaying Liu and Sijie Song and Chunhui Liu and Yanghao Li and Yueyu Hu", title = "A Benchmark Dataset and Comparison Study for Multi-modal Human Action Analytics", journal = j-TOMM, volume = "16", number = "2", pages = "41:1--41:24", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3365212", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3365212", abstract = "Large-scale benchmarks provide a solid foundation for the development of action analytics. Most of the previous activity benchmarks focus on analyzing actions in RGB videos. There is a lack of large-scale and high-quality benchmarks for multi-modal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duan:2020:EFE, author = "Mingxing Duan and Kenli Li and Aijia Ouyang and Khin Nandar Win and Keqin Li and Qi Tian", title = "{EGroupNet}: a Feature-enhanced Network for Age Estimation with Novel Age Group Schemes", journal = j-TOMM, volume = "16", number = "2", pages = "42:1--42:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3379449", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3379449", abstract = "Although age estimation is easily affected by smiling, race, gender, and other age-related attributes, most of the researchers did not pay attention to the correlations among these attributes. Moreover, many researchers perform age estimation from a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Baez-Suarez:2020:SSS, author = "Abraham B{\'a}ez-Su{\'a}rez and Nolan Shah and Juan Arturo Nolazco-Flores and Shou-Hsuan S. 
Huang and Omprakash Gnawali and Weidong Shi", title = "{SAMAF}: Sequence-to-sequence Autoencoder Model for Audio Fingerprinting", journal = j-TOMM, volume = "16", number = "2", pages = "43:1--43:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3380828", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3380828", abstract = "Audio fingerprinting techniques were developed to index and retrieve audio samples by comparing a content-based compact signature of the audio instead of the entire audio sample, thereby reducing memory and computational expense. Different techniques \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mettes:2020:SIB, author = "Pascal Mettes and Dennis C. Koelma and Cees G. M. Snoek", title = "Shuffled {ImageNet} Banks for Video Event Detection and Search", journal = j-TOMM, volume = "16", number = "2", pages = "44:1--44:21", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377875", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377875", abstract = "This article aims for the detection and search of events in videos, where video examples are either scarce or even absent during training. To enable such event detection and search, ImageNet concept banks have shown to be effective. Rather than \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Noori:2020:HAR, author = "Farzan Majeed Noori and Michael Riegler and Md Zia Uddin and Jim Torresen", title = "Human Activity Recognition from Multiple Sensors Data Using Multi-fusion Representations and {CNNs}", journal = j-TOMM, volume = "16", number = "2", pages = "45:1--45:19", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377882", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377882", abstract = "With the emerging interest in the ubiquitous sensing field, it has become possible to build assistive technologies for persons during their daily life activities to provide personalized feedback and services. For instance, it is possible to detect an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rossi:2020:DUB, author = "Silvia Rossi and Cagri Ozcinar and Aljosa Smolic and Laura Toni", title = "Do Users Behave Similarly in {VR}? 
{Investigation} of the User Influence on the System Design", journal = j-TOMM, volume = "16", number = "2", pages = "46:1--46:26", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3381846", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3381846", abstract = "With the overarching goal of developing user-centric Virtual Reality (VR) systems, a new wave of studies focused on understanding how users interact in VR environments has recently emerged. Despite the intense efforts, however, current literature still \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:LLF, author = "Xiao Wang and Wu Liu and Jun Chen and Xiaobo Wang and Chenggang Yan and Tao Mei", title = "Listen, Look, and Find the One: Robust Person Search with Multimodality Index", journal = j-TOMM, volume = "16", number = "2", pages = "47:1--47:20", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3380549", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3380549", abstract = "Person search with one portrait, which attempts to search the targets in arbitrary scenes using one portrait image at a time, is an essential yet unexplored problem in the multimedia field. Existing approaches, which predominantly depend on the visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Luo:2020:FFI, author = "Xiaofan Luo and Fukoeng Wong and Haifeng Hu", title = "{FIN}: Feature Integrated Network for Object Detection", journal = j-TOMM, volume = "16", number = "2", pages = "48:1--48:18", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3381086", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3381086", abstract = "Multi-layer detection is a widely used method in the field of object detection. It extracts multiple feature maps with different resolutions from the backbone network to detect objects of different scales, which can effectively cope with the problem of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Akpinar:2020:PPP, author = "Kutalmis Akpinar and Kien A. Hua", title = "{PPNet}: Privacy Protected {CDN--ISP} Collaboration for {QoS}-aware Multi-{CDN} Adaptive Video Streaming", journal = j-TOMM, volume = "16", number = "2", pages = "49:1--49:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3379983", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3379983", abstract = "Software-defined networking introduces opportunities to optimize the Internet Service Provider's network and to improve client experience for the Video-on-Demand applications. 
Recent studies on SDN frameworks show that traffic engineering methods allow \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tanwar:2020:CPP, author = "Vishesh Kumar Tanwar and Balasubramanian Raman and Amitesh Singh Rajput and Rama Bhargava", title = "{CryptoLesion}: a Privacy-preserving Model for Lesion Segmentation Using Whale Optimization over Cloud", journal = j-TOMM, volume = "16", number = "2", pages = "50:1--50:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3380743", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3380743", abstract = "The low-cost, accessing flexibility, agility, and mobility of cloud infrastructures have attracted medical organizations to store their high-resolution data in encrypted form. Besides storage, these infrastructures provide various image processing services for plain (non-encrypted) images. Meanwhile, the privacy and security of uploaded data depend upon the reliability of the service provider(s). The enforcement of laws towards privacy policies in health-care organizations, for not disclosing their patient's sensitive and private medical information, restrict them to utilize these services. To address these privacy concerns for melanoma detection, we propose CryptoLesion, a privacy-preserving model for segmenting lesion region using whale optimization algorithm (WOA) over the cloud in the encrypted domain (ED). The user's image is encrypted using a permutation ordered binary number system and a random stumble matrix. 
The task of segmentation is accomplished by dividing an encrypted image into a pre-defined number of clusters whose optimal centroids are obtained by WOA in ED, followed by the assignment of each pixel of an encrypted image to the unique centroid. The qualitative and quantitative analysis of CryptoLesion is evaluated over publicly available datasets provided in The International Skin Imaging Collaboration Challenges in 2016, 2017, 2018, and PH2 dataset. The segmented results obtained by CryptoLesion are found to be comparable with the winners of respective challenges. CryptoLesion is proved to be secure from a probabilistic viewpoint and various cryptographic attacks. To the best of our knowledge, CryptoLesion is first moving towards the direction of lesion segmentation in ED.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2020:DPC, author = "Zhedong Zheng and Liang Zheng and Michael Garrett and Yi Yang and Mingliang Xu and Yi-Dong Shen", title = "Dual-path Convolutional Image-Text Embeddings with Instance Loss", journal = j-TOMM, volume = "16", number = "2", pages = "51:1--51:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3383184", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3383184", abstract = "Matching images and sentences demands a fine understanding of both modalities. In this article, we propose a new system to discriminatively embed the image and text to a shared visual-textual space. In this field, most existing works apply the ranking \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2020:MPA, author = "Xiaowen Huang and Shengsheng Qian and Quan Fang and Jitao Sang and Changsheng Xu", title = "Meta-path Augmented Sequential Recommendation with Contextual Co-attention Network", journal = j-TOMM, volume = "16", number = "2", pages = "52:1--52:24", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3382180", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3382180", abstract = "It is critical to comprehensively and efficiently learn user preferences for an effective sequential recommender system. Existing sequential recommendation methods mainly focus on modeling local preference from users' historical behaviors, which largely \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2020:IMC, author = "Lingxiang Wu and Min Xu and Shengsheng Qian and Jianwei Cui", title = "Image to Modern {Chinese} Poetry Creation via a Constrained Topic-aware Model", journal = j-TOMM, volume = "16", number = "2", pages = "53:1--53:21", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3381858", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3381858", abstract = "Artificial creativity has attracted increasing research attention in the field of multimedia and artificial intelligence. 
Despite the promising work on poetry/painting/music generation, creating modern Chinese poetry from images, which can significantly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2020:RLV, author = "Zhili Zhou and Q. M. Jonathan Wu and Yimin Yang and Xingming Sun", title = "Region-Level Visual Consistency Verification for Large-Scale Partial-Duplicate Image Search", journal = j-TOMM, volume = "16", number = "2", pages = "54:1--54:25", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3383582", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3383582", abstract = "Most recent large-scale image search approaches build on a bag-of-visual-words model, in which local features are quantized and then efficiently matched between images. However, the limited discriminability of local features and the BOW quantization \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2020:STS, author = "Jiale He and Gaobo Yang and Xin Liu and Xiangling Ding", title = "Spatio-temporal Saliency-based Motion Vector Refinement for Frame Rate Up-conversion", journal = j-TOMM, volume = "16", number = "2", pages = "55:1--55:18", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3382506", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3382506", abstract = "A spatio-temporal saliency-based frame rate up-conversion (FRUC) approach is proposed, which achieves better quality of interpolated frames and invalidates existing texture variation-based FRUC detectors. A spatio-temporal saliency model is designed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gelli:2020:LVE, author = "Francesco Gelli and Tiberio Uricchio and Xiangnan He and Alberto {Del Bimbo} and Tat-Seng Chua", title = "Learning Visual Elements of Images for Discovery of Brand Posts", journal = j-TOMM, volume = "16", number = "2", pages = "56:1--56:21", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3385413", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3385413", abstract = "Online Social Network Sites have become a primary platform for brands and organizations to engage their audience by sharing image and video posts on their timelines. Different from traditional advertising, these posts are not restricted to the products \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2020:HRR, author = "Xian-Hua Han and Yinqiang Zheng and Jiande Sun and Yen-Wei Chen", title = "Hyperspectral Reconstruction with Redundant Camera Spectral Sensitivity Functions", journal = j-TOMM, volume = "16", number = "2", pages = "57:1--57:15", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3386313", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 16 10:45:32 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3386313", abstract = "High-resolution hyperspectral (HS) reconstruction has recently achieved significant progress, among which the method based on the fusion of the RGB and HS images of the same scene can greatly improve the reconstruction performance compared with those \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2020:ISI, author = "Honghao Gao and Yudong Zhang", title = "Introduction to the Special Issue on Smart Communications and Networking for Future Video Surveillance", journal = j-TOMM, volume = "16", number = "2s", pages = "58:1--58:2", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3398382", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3398382", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiang:2020:SDM, author = "Yizhang Jiang and Xiaoqing Gu and Dingcheng Ji and Pengjiang Qian and Jing Xue and Yuanpeng Zhang and Jiaqi Zhu and Kaijian Xia and Shitong Wang", title = "Smart Diagnosis: a Multiple-Source Transfer {TSK} Fuzzy System for {EEG} Seizure Identification", journal = j-TOMM, volume = "16", number = "2s", pages = "59:1--59:21", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3340240", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3340240", abstract = "To effectively identify electroencephalogram (EEG) signals in multiple-source domains, a multiple-source transfer learning-based Takagi-Sugeno-Kang (TSK) fuzzy system (FS), called MST-TSK, is proposed, which combines multiple-source transfer learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:DBD, author = "Shui-Hua Wang and Yu-Dong Zhang", title = "{DenseNet-201}-Based Deep Neural Network with Composite Learning Factor and Precomputation for Multiple Sclerosis Classification", journal = j-TOMM, volume = "16", number = "2s", pages = "60:1--60:19", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3341095", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3341095", abstract = "(Aim) Multiple sclerosis is a neurological condition that may cause neurologic disability. Convolutional neural network can achieve good results, but tuning hyperparameters of CNN needs expert knowledge and are difficult and time-consuming. To identify \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xia:2020:CDB, author = "Kaijian Xia and Hongsheng Yin and Yong Jin and Shi Qiu and Hongru Zhao", title = "Cross-Domain Brain {CT} Image Smart Segmentation via Shared Hidden Space Transfer {FCM} Clustering", journal = j-TOMM, volume = "16", number = "2s", pages = "61:1--61:21", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357233", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357233", abstract = "Clustering is an important issue in brain medical image segmentation. 
Original medical images used for clinical diagnosis are often insufficient for clustering in the current domain. As there are sufficient medical images in the related domains, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2020:STD, author = "Yonggang Li and Chunping Liu and Yi Ji and Shengrong Gong and Haibao Xu", title = "Spatio-Temporal Deep Residual Network with Hierarchical Attentions for Video Event Recognition", journal = j-TOMM, volume = "16", number = "2s", pages = "62:1--62:21", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3378026", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3378026", abstract = "Event recognition in surveillance video has gained extensive attention from the computer vision community. This process still faces enormous challenges due to the tiny inter-class variations that are caused by various facets, such as severe occlusion, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Si:2020:MLT, author = "Wen Si and Cong Liu and Zhongqin Bi and Meijing Shan", title = "Modeling Long-Term Dependencies from Videos Using Deep Multiplicative Neural Networks", journal = j-TOMM, volume = "16", number = "2s", pages = "63:1--63:19", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357797", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3357797", abstract = "Understanding temporal dependencies of videos is fundamental for vision problems, but deep learning-based models are still insufficient in this field. In this article, we propose a novel deep multiplicative neural network (DMNN) for learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2020:PCA, author = "Suguo Zhu and Xiaoxian Yang and Jun Yu and Zhenying Fang and Meng Wang and Qingming Huang", title = "Proposal Complementary Action Detection", journal = j-TOMM, volume = "16", number = "2s", pages = "64:1--64:12", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3361845", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3361845", abstract = "Temporal action detection not only requires correct classification but also needs to detect the start and end times of each action accurately. 
However, traditional approaches always employ sliding windows or actionness to predict the actions, and it is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2020:NTF, author = "Chenxi Huang and Yisha Lan and Guokai Zhang and Gaowei Xu and Landu Jiang and Nianyin Zeng and Jenhong Tan and E. Y. K. Ng and Yongqiang Cheng and Ningzhi Han and Rongrong Ji and Yonghong Peng", title = "A New Transfer Function for Volume Visualization of Aortic Stent and Its Application to Virtual Endoscopy", journal = j-TOMM, volume = "16", number = "2s", pages = "65:1--65:14", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3373358", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3373358", abstract = "Aortic stent has been widely used in restoring vascular stenosis and assisting patients with cardiovascular disease. The effective visualization of aortic stent is considered to be critical to ensure the effectiveness and functions of the aortic stent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zink:2020:IBP, author = "Michael Zink and Laura Toni and Ali C. 
Begen", title = "Introduction to the Best Papers from the {ACM Multimedia Systems (MMSys) 2019 and Co-Located Workshops}", journal = j-TOMM, volume = "16", number = "2s", pages = "66:1--66:2", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3398384", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3398384", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:PLB, author = "Rui-Xiao Zhang and Ming Ma and Tianchi Huang and Haitian Pang and Xin Yao and Chenglei Wu and Lifeng Sun", title = "A Practical Learning-based Approach for Viewer Scheduling in the Crowdsourced Live Streaming", journal = j-TOMM, volume = "16", number = "2s", pages = "67:1--67:22", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397226", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3397226", abstract = "Scheduling viewers effectively among different Content Delivery Network (CDN) providers is challenging owing to the extreme diversity in the crowdsourced live streaming (CLS) scenarios. Abundant algorithms have been proposed in recent years, which, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Altamimi:2020:QFD, author = "Sa'di Altamimi and Shervin Shirmohammadi", title = "{QoE}-Fair {DASH} Video Streaming Using Server-side Reinforcement Learning", journal = j-TOMM, volume = "16", number = "2s", pages = "68:1--68:21", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397227", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3397227", abstract = "To design an optimal adaptive video streaming method, video service providers need to consider both the efficiency and the fairness of the Quality of Experience (QoE) of their users. In Reference [8], we proposed a server-side QoE-fair rate adaptation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bentaleb:2020:PAA, author = "Abdelhak Bentaleb and Christian Timmerer and Ali C. Begen and Roger Zimmermann", title = "Performance Analysis of {ACTE}: a Bandwidth Prediction Method for Low-latency Chunked Streaming", journal = j-TOMM, volume = "16", number = "2s", pages = "69:1--69:24", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3387921", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3387921", abstract = "HTTP adaptive streaming with chunked transfer encoding can offer low-latency streaming without sacrificing the coding efficiency. 
This allows media segments to be delivered while still being packaged. However, conventional schemes often make widely \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pham:2020:ESR, author = "Stefan Pham and Patrick Heeren and Calvin Schmidt and Daniel Silhavy and Stefan Arbanowski", title = "Evaluation of Shared Resource Allocation Using {SAND} for {ABR} Streaming", journal = j-TOMM, volume = "16", number = "2s", pages = "70:1--70:18", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388926", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3388926", abstract = "Adaptive bitrate media streaming clients adjust the quality of media content depending on the current network conditions. The shared resource allocation (SRA) feature defined in MPEG-SAND (server and network assisted DASH) allows servers to allocate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gutterman:2020:RRT, author = "Craig Gutterman and Katherine Guo and Sarthak Arora and Trey Gilliland and Xiaoyang Wang and Les Wu and Ethan Katz-Bassett and Gil Zussman", title = "{Requet}: Real-Time {QoE} Metric Detection for Encrypted {YouTube} Traffic", journal = j-TOMM, volume = "16", number = "2s", pages = "71:1--71:28", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3394498", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3394498", abstract = "As video traffic dominates the Internet, it is important for operators to detect video quality of experience (QoE) to ensure adequate support for video traffic. With wide deployment of end-to-end encryption, traditional deep packet inspection-based traffic monitoring approaches are becoming ineffective. This poses a challenge for network operators to monitor user QoE and improve upon their experience. To resolve this issue, we develop and present a system for REal-time QUality of experience metric detection for Encrypted Traffic --- Requet --- which is suitable for network middlebox deployment. Requet uses a detection algorithm that we develop to identify video and audio chunks from the IP headers of encrypted traffic. Features extracted from the chunk statistics are used as input to a machine learning algorithm to predict QoE metrics, specifically buffer warning (low buffer, high buffer), video state (buffer increase, buffer decay, steady, stall), and video resolution. 
We collect a large YouTube dataset consisting of diverse video assets delivered over various WiFi and LTE network conditions to evaluate the performance. We compare Requet with a baseline system based on previous work and show that Requet outperforms the baseline system in accuracy of predicting buffer low warning, video state, and video resolution by $ 1.12 \times $, $ 1.53 \times $, and $ 3.14 \times $, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2020:ATL, author = "Xinjue Hu and Jingming Shan and Yu Liu and Lin Zhang and Shervin Shirmohammadi", title = "An Adaptive Two-Layer Light Field Compression Scheme Using {GNN}-Based Reconstruction", journal = j-TOMM, volume = "16", number = "2s", pages = "72:1--72:23", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3395620", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3395620", abstract = "As a new form of volumetric media, Light Field (LF) can provide users with a true six degrees of freedom immersive experience because LF captures the scene with photo-realism, including aperture-limited changes in viewpoint. But uncompressed LF data is too large for network transmission, which is the reason why LF compression has become an important research topic. One of the more recent approaches for LF compression is to reduce the angular resolution of the input LF during compression and to use LF reconstruction to recover the discarded viewpoints during decompression. 
Following this approach, we propose a new LF reconstruction algorithm based on Graph Neural Networks; we show that it can achieve higher compression and better quality compared to existing reconstruction methods, although suffering from the same problem as those methods --- the inability to deal effectively with high-frequency image components. To solve this problem, we propose an adaptive two-layer compression architecture that separates high-frequency and low-frequency components and compresses each with a different strategy so that the performance can become robust and controllable. Experiments with multiple datasets show that our proposed scheme is capable of providing a decompression quality of above 40 dB, and can significantly improve compression efficiency compared with similar LF reconstruction schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Claypool:2020:IMD, author = "Mark Claypool and Andy Cockburn and Carl Gutwin", title = "The Impact of Motion and Delay on Selecting Game Targets with a Mouse", journal = j-TOMM, volume = "16", number = "2s", pages = "73:1--73:24", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3390464", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sun Jul 19 08:56:56 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3390464", abstract = "All real-time computer games, particularly networked computer games, have a delay from when a player starts an action (e.g., clicking the mouse) until the game renders the result (e.g., firing a projectile). This delay can degrade both player \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Anonymous:2020:TCO, author = "Anonymous", title = "Table of Contents: Online Supplement Volume 16, Number 1s", journal = j-TOMM, volume = "16", number = "3", pages = "74:1--74:5", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409367", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:45:43 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3409367", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2020:CLR, author = "Liang Yang and Haifeng Hu and Songlong Xing and Xinlong Lu", title = "Constrained {LSTM} and Residual Attention for Image Captioning", journal = j-TOMM, volume = "16", number = "3", pages = "75:1--75:18", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3386725", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3386725", abstract = "Visual structure and syntactic structure are essential in images and texts, respectively. Visual structure depicts both entities in an image and their interactions, whereas syntactic structure in texts can reflect the part-of-speech constraints between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2020:DTN, author = "Donghuo Zeng and Yi Yu and Keizo Oyama", title = "Deep Triplet Neural Networks with Cluster-{CCA} for Audio-Visual Cross-Modal Retrieval", journal = j-TOMM, volume = "16", number = "3", pages = "76:1--76:23", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3387164", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3387164", abstract = "Cross-modal retrieval aims to retrieve data in one modality by a query in another modality, which has been a very interesting research issue in the field of multimedia, information retrieval, and computer vision, and database. Most existing works focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Su:2020:MVG, author = "Yu-Ting Su and Wen-Hui Li and Wei-Zhi Nie and An-An Liu", title = "Multi-View Graph Matching for {$3$D} Model Retrieval", journal = j-TOMM, volume = "16", number = "3", pages = "77:1--77:20", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3387920", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3387920", abstract = "3D model retrieval has been widely utilized in numerous domains, such as computer-aided design, digital entertainment, and virtual reality. 
Recently, many graph-based methods have been proposed to address this task by using multi-view information of 3D \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fan:2020:RAN, author = "Hehe Fan and Linchao Zhu and Yi Yang and Fei Wu", title = "Recurrent Attention Network with Reinforced Generator for Visual Dialog", journal = j-TOMM, volume = "16", number = "3", pages = "78:1--78:16", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3390891", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3390891", abstract = "In Visual Dialog, an agent has to parse temporal context in the dialog history and spatial context in the image to hold a meaningful dialog with humans. For example, to answer ``what is the man on her left wearing?'' the agent needs to (1) analyze the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2020:ABM, author = "Feiran Huang and Kaimin Wei and Jian Weng and Zhoujun Li", title = "Attention-Based Modality-Gated Networks for Image-Text Sentiment Analysis", journal = j-TOMM, volume = "16", number = "3", pages = "79:1--79:19", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388861", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3388861", abstract = "Sentiment analysis of social multimedia data has attracted extensive research interest and has been applied to many tasks, such as election prediction and products evaluation. Sentiment analysis of one modality (e.g., text or image) has been broadly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:PSE, author = "Shangfei Wang and Longfei Hao and Qiang Ji", title = "Posed and Spontaneous Expression Distinction Using Latent Regression {Bayesian} Networks", journal = j-TOMM, volume = "16", number = "3", pages = "80:1--80:18", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3391290", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3391290", abstract = "Facial spatial patterns can help distinguish between posed and spontaneous expressions, but this information has not been thoroughly leveraged by current studies. 
We present several latent regression Bayesian networks (LRBNs) to capture the patterns \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2020:UNA, author = "Fangyu Liu and R{\'e}mi Lebret and Didier Orel and Philippe Sordet and Karl Aberer", title = "Upgrading the Newsroom: an Automated Image Selection System for News Articles", journal = j-TOMM, volume = "16", number = "3", pages = "81:1--81:28", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3396520", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3396520", abstract = "We propose an automated image selection system to assist photo editors in selecting suitable images for news articles. The system fuses multiple textual sources extracted from news articles and accepts multilingual inputs. It is equipped with char-level \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2020:FSM, author = "Chenlei Lv and Zhongke Wu and Xingce Wang and Mingquan Zhou", title = "{$3$D} Facial Similarity Measurement and Its Application in Facial Organization", journal = j-TOMM, volume = "16", number = "3", pages = "82:1--82:20", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397765", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3397765", abstract = "We propose a novel framework for 3D facial similarity measurement and its application in facial organization. The construction of the framework is based on Kendall shape space theory. Kendall shape space is a quotient space that is constructed by shape \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2020:ICJ, author = "Jin Yuan and Lei Zhang and Songrui Guo and Yi Xiao and Zhiyong Li", title = "Image Captioning with a Joint Attention Mechanism by Visual Concept Samples", journal = j-TOMM, volume = "16", number = "3", pages = "83:1--83:22", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3394955", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3394955", abstract = "The attention mechanism has been established as an effective method for generating caption words in image captioning; it explores one noticed subregion in an image to predict a related caption word. However, even though the attention mechanism could \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2020:IMP, author = "Xun Wang and Yan Tian and Xuran Zhao and Tao Yang and Judith Gelernter and Jialei Wang and Guohua Cheng and Wei Hu", title = "Improving Multiperson Pose Estimation by Mask-aware Deep Reinforcement Learning", journal = j-TOMM, volume = "16", number = "3", pages = "84:1--84:18", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397340", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3397340", abstract = "Research on single-person pose estimation based on deep neural networks has recently witnessed progress in both accuracy and execution efficiency. However, multiperson pose estimation is still a challenging topic, partially because the object regions \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2020:LJS, author = "Shenming Feng and Haifeng Hu", title = "Learning Joint Structure for Human Pose Estimation", journal = j-TOMM, volume = "16", number = "3", pages = "85:1--85:17", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3392302", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3392302", abstract = "Recently, tremendous progress has been achieved on human pose estimation with the development of convolutional neural networks (CNNs). 
However, current methods still suffer from severe occlusion, back view, and large pose variation due to the lack of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2020:SSI, author = "Feng Lin and Bin Li and Wengang Zhou and Houqiang Li and Yan Lu", title = "Single-stage Instance Segmentation", journal = j-TOMM, volume = "16", number = "3", pages = "86:1--86:19", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3387926", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3387926", abstract = "Albeit the highest accuracy of object detection is generally acquired by multi-stage detectors, like R-CNN and its extension approaches, the single-stage object detectors also achieve remarkable performance with faster execution and higher scalability. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiang:2020:FSF, author = "Shuqiang Jiang and Weiqing Min and Yongqiang Lyu and Linhu Liu", title = "Few-shot Food Recognition via Multi-view Representation Learning", journal = j-TOMM, volume = "16", number = "3", pages = "87:1--87:20", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3391624", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3391624", abstract = "This article considers the problem of few-shot learning for food recognition. Automatic food recognition can support various applications, e.g., dietary assessment and food journaling. Most existing works focus on food recognition with large numbers of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ho:2020:SGD, author = "Trang-Thi Ho and John Jethro Virtusio and Yung-Yao Chen and Chih-Ming Hsu and Kai-Lung Hua", title = "Sketch-guided Deep Portrait Generation", journal = j-TOMM, volume = "16", number = "3", pages = "88:1--88:18", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3396237", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3396237", abstract = "Generating a realistic human class image from a sketch is a unique and challenging problem considering that the human body has a complex structure that must be preserved. 
Additionally, input sketches often lack important details that are crucial in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Srivastava:2020:DAI, author = "Gargi Srivastava and Rajeev Srivastava", title = "Design, Analysis, and Implementation of Efficient Framework for Image Annotation", journal = j-TOMM, volume = "16", number = "3", pages = "89:1--89:24", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3386249", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3386249", abstract = "In this article, a general framework of image annotation is proposed by involving salient object detection (SOD), feature extraction, feature selection, and multi-label classification. For SOD, Augmented-Gradient Vector Flow (A-GVF) is proposed, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2020:KAN, author = "Dongyang Zhang and Jie Shao and Heng Tao Shen", title = "Kernel Attention Network for Single Image Super-Resolution", journal = j-TOMM, volume = "16", number = "3", pages = "90:1--90:15", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3398685", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3398685", abstract = "Recently, attention mechanisms have shown a developing tendency toward convolutional neural network (CNN), and some representative attention mechanisms, i.e., channel attention (CA) and spatial attention (SA) have been fully applied to single image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2020:BIQ, author = "Yutao Liu and Ke Gu and Xiu Li and Yongbing Zhang", title = "Blind Image Quality Assessment by Natural Scene Statistics and Perceptual Characteristics", journal = j-TOMM, volume = "16", number = "3", pages = "91:1--91:91", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3414837", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3414837", abstract = "Opinion-unaware blind image quality assessment (OU BIQA) refers to establishing a blind quality prediction model without using the expensive subjective quality scores, which is a highly promising direction in the BIQA research. In this article, we focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Francis:2020:UTF, author = "Jobin Francis and Baburaj M. and Sudhish N. George", title = "A Unified Tensor Framework for Clustering and Simultaneous Reconstruction of Incomplete Imaging Data", journal = j-TOMM, volume = "16", number = "3", pages = "92:1--92:24", month = sep, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3399806", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Sep 5 18:46:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3399806", abstract = "Incomplete observations in the data are always troublesome to data clustering algorithms. 
In fact, most of the well-received techniques are not designed to encounter such imperative scenarios. Hence, clustering of images under incomplete samples is an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sharma:2021:ISI, author = "Suraj Sharma and Xuyun Zhang and Hesham El-Sayed and Zhiyuan Tan", title = "Introduction to the Special Issue on Privacy and Security in Evolving {Internet of Multimedia Things}", journal = j-TOMM, volume = "16", number = "3s", pages = "93:1--93:3", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3423955", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3423955", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:LBO, author = "Xiaolong Xu and Qihe Huang and Yiwen Zhang and Shancang Li and Lianyong Qi and Wanchun Dou", title = "An {LSH}-based Offloading Method for {IoMT} Services in Integrated Cloud-Edge Environment", journal = j-TOMM, volume = "16", number = "3s", pages = "94:1--94:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408319", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408319", abstract = "Benefiting from the massive available data provided by Internet of multimedia things (IoMT), enormous intelligent services requiring information of various types to make decisions are emerging. Generally, the IoMT devices are equipped with limited \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gati:2021:DPT, author = "Nicholaus J. Gati and Laurence T. 
Yang and Jun Feng and Yijun Mo and Mamoun Alazab", title = "Differentially Private Tensor Train Deep Computation for {Internet of Multimedia Things}", journal = j-TOMM, volume = "16", number = "3s", pages = "95:1--95:20", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3421276", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3421276", abstract = "The significant growth of the Internet of Things (IoT) takes a key and active role in healthcare, smart homes, smart manufacturing, and wearable gadgets. Due to complexness and difficulty in processing multimedia data, the IoT based scheme, namely \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2021:FBS, author = "Haoran Liang and Jun Wu and Xi Zheng and Mengshi Zhang and Jianhua Li and Alireza Jolfaei", title = "Fog-based Secure Service Discovery for {Internet of Multimedia Things}: a Cross-blockchain Approach", journal = j-TOMM, volume = "16", number = "3s", pages = "96:1--96:23", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3415151", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3415151", abstract = "The Internet of Multimedia Things (IoMT) has become the backbone of innumerable multimedia applications in various fields. The wide application of IoMT not only makes our life convenient but also brings challenges to service discovery. 
Service discovery \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2021:ASI, author = "Zhihan Lv and Liang Qiao and Houbing Song", title = "Analysis of the Security of {Internet of Multimedia Things}", journal = j-TOMM, volume = "16", number = "3s", pages = "97:1--97:16", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3398201", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3398201", abstract = "To study the security performance of the Internet of multimedia things on the privacy protection of user identity, behavior trajectory, and preference under the new information technology industry wave, in this study, aiming at the problems of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sahoo:2021:SAD, author = "Kshira Sagar Sahoo and Deepak Puthal", title = "{SDN}-Assisted {DDoS} Defense Framework for the {Internet of Multimedia Things}", journal = j-TOMM, volume = "16", number = "3s", pages = "98:1--98:18", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3394956", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3394956", abstract = "The Internet of Things is visualized as a fundamental networking model that bridges the gap between the cyber and real-world entity. Uniting the real-world object with virtualization technology is opening further opportunities for innovation in nearly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Namasudra:2021:SMU, author = "Suyel Namasudra and Rupak Chakraborty and Abhishek Majumder and Nageswara Rao Moparthi", title = "Securing Multimedia by Using {DNA}-Based Encryption in the Cloud Computing Environment", journal = j-TOMM, volume = "16", number = "3s", pages = "99:1--99:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3392665", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3392665", abstract = "Today, the size of a multimedia file is increasing day by day from gigabytes to terabytes or even petabytes, mainly because of the evolution of a large amount of real-time data. As most of the multimedia files are transmitted through the internet, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fang:2021:PPM, author = "Liming Fang and Changchun Yin and Juncen Zhu and Chunpeng Ge and M. 
Tanveer and Alireza Jolfaei and Zehong Cao", title = "Privacy Protection for Medical Data Sharing in Smart Healthcare", journal = j-TOMM, volume = "16", number = "3s", pages = "100:1--100:18", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408322", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408322", abstract = "In virtue of advances in smart networks and the cloud computing paradigm, smart healthcare is transforming. However, there are still challenges, such as storing sensitive data in untrusted and controlled infrastructure and ensuring the secure \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2021:DHC, author = "A. K. Singh", title = "Data Hiding: Current Trends, Innovation and Potential Challenges", journal = j-TOMM, volume = "16", number = "3s", pages = "101:1--101:16", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3382772", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3382772", abstract = "With the widespread growth of digital information and improved internet technologies, the demand for improved information security techniques has significantly increased due to privacy leakage, identity theft, illegal copying, and data distribution. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2021:MLM, author = "Hezhen Hu and Wengang Zhou and Xingze Li and Ning Yan and Houqiang Li", title = "{MV2Flow}: Learning Motion Representation for Fast Compressed Video Action Recognition", journal = j-TOMM, volume = "16", number = "3s", pages = "102:1--102:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3422360", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3422360", abstract = "In video action recognition, motion is a very crucial clue, which is usually represented by optical flow. However, optical flow is computationally expensive to obtain, which becomes the bottleneck for the efficiency of traditional action recognition \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cui:2021:SSI, author = "Chaoran Cui and Peiguang Lin and Xiushan Nie and Muwei Jian and Yilong Yin", title = "Social-sensed Image Aesthetics Assessment", journal = j-TOMM, volume = "16", number = "3s", pages = "103:1--103:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3414843", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 22 06:57:30 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3414843", abstract = "Image aesthetics assessment aims to endow computers with the ability to judge the aesthetic values of images, and its potential has been recognized in a variety of applications. Most previous studies perform aesthetics assessment purely based on image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sharma:2021:TCO, author = "Suraj Sharma", title = "Table of Contents: Online Supplement Volume 16, Number 3s", journal = j-TOMM, volume = "16", number = "4", pages = "117e-1:117e-2", month = jan, year = "2021", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 10:01:20 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "117e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961", } @Article{Shao:2021:EBR, author = "Huiru Shao and Jing Li and Jia Zhang and Hui Yu and Jiande Sun", title = "Eye-based Recognition for User Identification on Mobile Devices", journal = j-TOMM, volume = "16", number = "4", pages = "117:1--117:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3399659", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3399659", abstract = "User identification is becoming more and more important for Apps on mobile devices. However, the identity recognition based on eyes, e.g., iris recognition, is rarely used on mobile devices comparing with those based on face and fingerprint due to its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:NKT, author = "Zuquan Liu and Guopu Zhu and Yuan-Gen Wang and Jianquan Yang and Sam Kwong", title = "A Novel $ (t, s, k, n)$-Threshold Visual Secret Sharing Scheme Based on Access Structure Partition", journal = j-TOMM, volume = "16", number = "4", pages = "118:1--118:21", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418212", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3418212", abstract = "Visual secret sharing (VSS) is a new technique for sharing a binary image into multiple shadows.
For VSS, the original image can be reconstructed from the shadows in any qualified set, but cannot be reconstructed from those in any forbidden set. In most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Becattini:2021:DPA, author = "Federico Becattini and Tiberio Uricchio and Lorenzo Seidenari and Lamberto Ballan and Alberto {Del Bimbo}", title = "Am {I} Done? {Predicting} Action Progress in Videos", journal = j-TOMM, volume = "16", number = "4", pages = "119:1--119:24", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3402447", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3402447", abstract = "In this article, we deal with the problem of predicting action progress in videos. We argue that this is an extremely important task, since it can be valuable for a wide range of interaction applications. To this end, we introduce a novel approach, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ruan:2021:CDI, author = "Weijian Ruan and Chao Liang and Yi Yu and Zheng Wang and Wu Liu and Jun Chen and Jiayi Ma", title = "Correlation Discrepancy Insight Network for Video Re-identification", journal = j-TOMM, volume = "16", number = "4", pages = "120:1--120:21", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3402666", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3402666", abstract = "Video-based person re-identification (ReID) aims at re-identifying a specified person sequence from videos that were captured by disjoint cameras. Most existing works on this task ignore the quality discrepancy across frames by using all video frames to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2021:SSI, author = "Xin Yang and Yu Qiao and Shaozhe Chen and Shengfeng He and Baocai Yin and Qiang Zhang and Xiaopeng Wei and Rynson W. H. Lau", title = "Smart Scribbles for Image Matting", journal = j-TOMM, volume = "16", number = "4", pages = "121:1--121:21", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408323", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408323", abstract = "Image matting is an ill-posed problem that usually requires additional user input, such as trimaps or scribbles. 
Drawing a fine trimap requires a large amount of user effort, while using scribbles can hardly obtain satisfactory alpha mattes for non-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2021:DID, author = "Chenggang Yan and Zhisheng Li and Yongbing Zhang and Yutao Liu and Xiangyang Ji and Yongdong Zhang", title = "Depth Image Denoising Using Nuclear Norm and Learning Graph Model", journal = j-TOMM, volume = "16", number = "4", pages = "122:1--122:17", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3404374", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3404374", abstract = "Depth image denoising is increasingly becoming the hot research topic nowadays, because it reflects the three-dimensional scene and can be applied in various fields of computer vision. But the depth images obtained from depth camera usually contain \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2021:MAS, author = "Lin Zhu and Xiurong Jiang and Jianing Li and Yuanhong Hao and Yonghong Tian", title = "Motion-Aware Structured Matrix Factorization for Foreground Detection in Complex Scenes", journal = j-TOMM, volume = "16", number = "4", pages = "123:1--123:23", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3407188", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3407188", abstract = "Foreground detection is one of the key steps in computer vision applications. Many foreground and background models have been proposed and achieved promising performance in static scenes. However, due to challenges such as dynamic background, irregular \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2021:CNL, author = "Yang Wei and Zhuzhu Wang and Bin Xiao and Ximeng Liu and Zheng Yan and Jianfeng Ma", title = "Controlling Neural Learning Network with Multiple Scales for Image Splicing Forgery Detection", journal = j-TOMM, volume = "16", number = "4", pages = "124:1--124:22", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408299", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408299", abstract = "The guarantee of social stability comes from many aspects of life, and image information security as one of them is being subjected to various malicious attacks. As a means of information attack, image splicing forgery refers to copying some areas of an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2021:VRS, author = "Kun Zeng and Jiangchuan Hu and Yongyi Gong and Kanoksak Wattanachote and Runpeng Yu and Xiaonan Luo", title = "Vertical Retargeting for Stereoscopic Images via Stereo Seam Carving", journal = j-TOMM, volume = "16", number = "4", pages = "125:1--125:22", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408295", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408295", abstract = "Vertical retargeting for stereoscopic images using seam manipulation-based approaches has remained an open challenge over the years. Even though horizontal retargeting had attracted a huge amount of interest, its seam coupling strategies were not \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tian:2021:PIC, author = "Tao Tian and Hanli Wang and Sam Kwong and C.-C. 
Jay Kuo", title = "Perceptual Image Compression with Block-Level Just Noticeable Difference Prediction", journal = j-TOMM, volume = "16", number = "4", pages = "126:1--126:15", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408320", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408320", abstract = "A block-level perceptual image compression framework is proposed in this work, including a block-level just noticeable difference (JND) prediction model and a preprocessing scheme. Specifically speaking, block-level JND values are first deduced by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2021:MFU, author = "Xin He and Qiong Liu and You Yang", title = "Make Full Use of Priors: Cross-View Optimized Filter for Multi-View Depth Enhancement", journal = j-TOMM, volume = "16", number = "4", pages = "127:1--127:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408293", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408293", abstract = "Multi-view video plus depth (MVD) is the promising and widely adopted data representation for future 3D visual applications and interactive media. However, compression distortions on depth videos impede the development of such applications, and filters \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:AAB, author = "Xiaoxiao Liu and Qingyang Xu", title = "Adaptive Attention-based High-level Semantic Introduction for Image Caption", journal = j-TOMM, volume = "16", number = "4", pages = "128:1--128:22", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3409388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3409388", abstract = "There have been several attempts to integrate a spatial visual attention mechanism into an image caption model and introduce semantic concepts as the guidance of image caption generation. High-level semantic information consists of the abstractedness \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{UlFazal:2021:EIC, author = "Muhammad Abu {Ul Fazal} and Sam Ferguson and Andrew Johnston", title = "Evaluation of Information Comprehension in Concurrent Speech-based Designs", journal = j-TOMM, volume = "16", number = "4", pages = "129:1--129:19", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3409463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3409463", abstract = "In human-computer interaction, particularly in multimedia delivery, information is communicated to users sequentially, whereas users are capable of receiving information from multiple sources concurrently. This mismatch indicates that a sequential mode \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2021:LDA, author = "Yucheng Zhu and Guangtao Zhai and Xiongkuo Min and Jiantao Zhou", title = "Learning a Deep Agent to Predict Head Movement in 360-Degree Images", journal = j-TOMM, volume = "16", number = "4", pages = "130:1--130:23", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410455", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3410455", abstract = "Virtual reality adequately stimulates senses to trick users into accepting the virtual environment. 
To create a sense of immersion, high-resolution images are required to satisfy human visual system, and low latency is essential for smooth operations, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nie:2021:MMI, author = "Weizhi Nie and Qi Liang and Yixin Wang and Xing Wei and Yuting Su", title = "{MMFN}: Multimodal Information Fusion Networks for {$3$D} Model Classification and Retrieval", journal = j-TOMM, volume = "16", number = "4", pages = "131:1--131:22", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410439", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3410439", abstract = "In recent years, research into 3D shape recognition in the field of multimedia and computer vision has attracted wide attention. With the rapid development of deep learning, various deep models have achieved state-of-the-art performance based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2021:GRC, author = "Zhongying Zhao and Yonghao Yang and Chao Li and Liqiang Nie", title = "{GuessUNeed}: Recommending Courses via Neural Attention Network and Course Prerequisite Relation Embeddings", journal = j-TOMM, volume = "16", number = "4", pages = "132:1--132:17", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410441", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3410441", abstract = "Massive Open Online Courses, offering millions of high-quality courses from prestigious universities and prominent experts, are picking up momentum in popularity. Although users enrolling on MOOCs have free access to abundant knowledge, they may easily \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2021:KDE, author = "Yi Huang and Xiaoshan Yang and Junyu Gao and Jitao Sang and Changsheng Xu", title = "Knowledge-driven Egocentric Multimodal Activity Recognition", journal = j-TOMM, volume = "16", number = "4", pages = "133:1--133:133", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3409332", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3409332", abstract = "Recognizing activities from egocentric multimodal data collected by wearable cameras and sensors, is gaining interest, as multimodal methods always benefit from the complementarity of different modalities. However, since high-dimensional videos contain \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:PBS, author = "Yaoyu Li and Hantao Yao and Tianzhu Zhang and Changsheng Xu", title = "Part-based Structured Representation Learning for Person Re-identification", journal = j-TOMM, volume = "16", number = "4", pages = "134:1--134:22", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3412384", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Feb 10 10:15:11 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3412384", abstract = "Person re-identification aims to match person of interest under non-overlapping camera views. 
Therefore, how to generate a robust and discriminative representation is crucial for person re-identification. Mining local clues from human body parts to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jin:2021:MTL, author = "Xin Jin and Jianfeng Xu and Kazuyuki Tasaka and Zhibo Chen", title = "Multi-task Learning-based All-in-one Collaboration Framework for Degraded Image Super-resolution", journal = j-TOMM, volume = "17", number = "1", pages = "21:1--21:21", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3417333", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3417333", abstract = "In this article, we address the degraded image super-resolution problem in a multi-task learning (MTL) manner. To better share representations between multiple tasks, we propose an all-in-one collaboration framework (ACF) with a learnable ``junction'' unit \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tran:2021:CQM, author = "Huyen T. T. 
Tran and Nam Pham Ngoc and Tobias Ho{\ss}feld and Michael Seufert and Truong Cong Thang", title = "Cumulative Quality Modeling for {HTTP} Adaptive Streaming", journal = j-TOMM, volume = "17", number = "1", pages = "22:1--22:24", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3423421", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3423421", abstract = "HTTP Adaptive Streaming has become the de facto choice for multimedia delivery. However, the quality of adaptive video streaming may fluctuate strongly during a session due to throughput fluctuations. So, it is important to evaluate the quality of a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:SVM, author = "Tong Xu and Peilun Zhou and Linkang Hu and Xiangnan He and Yao Hu and Enhong Chen", title = "Socializing the Videos: a Multimodal Approach for Social Relation Recognition", journal = j-TOMM, volume = "17", number = "1", pages = "23:1--23:23", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3416493", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3416493", abstract = "As a crucial task for video analysis, social relation recognition for characters not only provides semantically rich description of video content but also supports intelligent applications, e.g., video retrieval and visual question answering. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2021:RSI, author = "Xuehu Yan and Lintao Liu and Longlong Li and Yuliang Lu", title = "Robust Secret Image Sharing Resistant to Noise in Shares", journal = j-TOMM, volume = "17", number = "1", pages = "24:1--24:22", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3419750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3419750", abstract = "A secret image is split into $n$ shares in the generation phase of secret image sharing (SIS) for a $(k, n)$ threshold. In the recovery phase, the secret image is recovered when any $k$ or more shares are collected, and each collected share is generally assumed to be \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:ANM, author = "Mingliang Xu and Qingfeng Li and Jianwei Niu and Hao Su and Xiting Liu and Weiwei Xu and Pei Lv and Bing Zhou and Yi Yang", title = "{ART-UP}: a Novel Method for Generating Scanning-Robust Aesthetic {QR} Codes", journal = j-TOMM, volume = "17", number = "1", pages = "25:1--25:23", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418214", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3418214", abstract = "Quick response (QR) codes are usually scanned in different environments, so they must be robust to variations in illumination, scale, coverage, and camera angles. Aesthetic QR codes improve the visual quality, but subtle changes in their appearance may \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2021:CIR, author = "Peihao Yang and Linghe Kong and Meikang Qiu and Xue Liu and Guihai Chen", title = "Compressed Imaging Reconstruction with Sparse Random Projection", journal = j-TOMM, volume = "17", number = "1", pages = "26:1--26:25", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447431", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447431", abstract = "As the Internet of Things thrives, monitors and cameras produce tons of image data every day. 
To efficiently process these images, many compressed imaging frameworks are proposed. A compressed imaging framework comprises two parts, image signal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qi:2021:GNT, author = "Lei Qi and Lei Wang and Jing Huo and Yinghuan Shi and Yang Gao", title = "{GreyReID}: a Novel Two-stream Deep Framework with {RGB}-grey Information for Person Re-identification", journal = j-TOMM, volume = "17", number = "1", pages = "27:1--27:22", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3419439", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3419439", abstract = "In this article, we observe that most false positive images (i.e., different identities with query images) in the top ranking list usually have the similar color information with the query image in person re-identification (Re-ID). Meanwhile, when we use \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chehabeddine:2021:BMH, author = "Said Chehabeddine and Muhammad Hassan Jamil and Wanjoo Park and Dianne L. Sefo and Peter M. 
Loomer and Mohamad Eid", title = "{Bi}-manual Haptic-based Periodontal Simulation with Finger Support and Vibrotactile Feedback", journal = j-TOMM, volume = "17", number = "1", pages = "28:1--28:17", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3421765", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3421765", abstract = "The rise of virtual reality and haptic technologies has created exciting new applications in medical training and education. In a dental simulation, haptic technology can create the illusion of substances (teeth, gingiva, bone, etc.) by providing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:MHP, author = "Jianshu Li and Jian Zhao and Congyan Lang and Yidong Li and Yunchao Wei and Guodong Guo and Terence Sim and Shuicheng Yan and Jiashi Feng", title = "Multi-human Parsing with a Graph-based Generative Adversarial Model", journal = j-TOMM, volume = "17", number = "1", pages = "29:1--29:21", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418217", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3418217", abstract = "Human parsing is an important task in human-centric image understanding in computer vision and multimedia systems. However, most existing works on human parsing mainly tackle the single-person scenario, which deviates from real-world applications where \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cinar:2021:IJB, author = "Yusuf Cinar and Peter Pocta and Desmond Chambers and Hugh Melvin", title = "Improved Jitter Buffer Management for {WebRTC}", journal = j-TOMM, volume = "17", number = "1", pages = "30:1--30:20", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410449", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3410449", abstract = "This work studies the jitter buffer management algorithm for Voice over IP in WebRTC. In particular, it details the core concepts of WebRTC's jitter buffer management. Furthermore, it investigates how jitter buffer management algorithm behaves under \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Czekierda:2021:AOO, author = "Lukasz Czekierda and Krzysztof Zieli{\'n}ski and S{\l}awomir Zieli{\'n}ski", title = "Automated Orchestration of Online Educational Collaboration in Cloud-based Environments", journal = j-TOMM, volume = "17", number = "1", pages = "31:1--31:26", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3412381", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3412381", abstract = "Integrated collaboration environments (ICEs) are widely used by corporations to increase productivity by fostering groupwide and interpersonal collaboration. In this article, we discuss the enhancements of such environment needed to build an educational \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kieu:2021:BLD, author = "My Kieu and Andrew D. 
Bagdanov and Marco Bertini", title = "Bottom-up and Layerwise Domain Adaptation for Pedestrian Detection in Thermal Images", journal = j-TOMM, volume = "17", number = "1", pages = "32:1--32:19", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418213", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3418213", abstract = "Pedestrian detection is a canonical problem for safety and security applications, and it remains a challenging problem due to the highly variable lighting conditions in which pedestrians must be detected. This article investigates several domain \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:MIH, author = "Wenjie Wang and Ling-Yu Duan and Hao Jiang and Peiguang Jing and Xuemeng Song and Liqiang Nie", title = "{Market$2$Dish}: Health-aware Food Recommendation", journal = j-TOMM, volume = "17", number = "1", pages = "33:1--33:19", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418211", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3418211", abstract = "With the rising incidence of some diseases, such as obesity and diabetes, the healthy diet is arousing increasing attention. However, most existing food-related research efforts focus on recipe retrieval, user-preference-based food recommendation, cooking \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:ADA, author = "Yiding Liu and Siyu Yang and Bin Li and Wengang Zhou and Jizheng Xu and Houqiang Li and Yan Lu", title = "Affinity Derivation for Accurate Instance Segmentation", journal = j-TOMM, volume = "17", number = "1", pages = "34:1--34:20", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3407090", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3407090", abstract = "Affinity, which represents whether two pixels belong to a same instance, is an equivalent representation to the instance segmentation labels. Conventional works do not make an explicit exploration on the affinity. In this article, we present two instance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2021:CLG, author = "Yi Yu and Abhishek Srivastava and Simon Canales", title = "Conditional {LSTM-GAN} for Melody Generation from Lyrics", journal = j-TOMM, volume = "17", number = "1", pages = "35:1--35:20", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3424116", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3424116", abstract = "Melody generation from lyrics has been a challenging research issue in the field of artificial intelligence and music, which enables us to learn and discover latent relationships between interesting lyrics and accompanying melodies. Unfortunately, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2021:AWE, author = "Xin Yang and Xuemeng Song and Fuli Feng and Haokun Wen and Ling-Yu Duan and Liqiang Nie", title = "Attribute-wise Explainable Fashion Compatibility Modeling", journal = j-TOMM, volume = "17", number = "1", pages = "36:1--36:21", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3425636", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3425636", abstract = "With the boom of the fashion market and people's daily needs for beauty, clothing matching has gained increased research attention. 
In a sense, tackling this problem lies in modeling the human notions of the compatibility between fashion items, i.e., \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:SSL, author = "Zhixin Li and Lan Lin and Canlong Zhang and Huifang Ma and Weizhong Zhao and Zhiping Shi", title = "A Semi-supervised Learning Approach Based on Adaptive Weighted Fusion for Automatic Image Annotation", journal = j-TOMM, volume = "17", number = "1", pages = "37:1--37:23", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3426974", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3426974", abstract = "To learn a well-performed image annotation model, a large number of labeled samples are usually required. Although the unlabeled samples are readily available and abundant, it is a difficult task for humans to annotate large numbers of images manually. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:DVV, author = "Yanwei Liu and Jinxia Liu and Antonios Argyriou and Siwei Ma and Liming Wang and Zhen Xu", title = "$ 360$-Degree {VR} Video Watermarking Based on Spherical Wavelet Transform", journal = j-TOMM, volume = "17", number = "1", pages = "38:1--38:23", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3425605", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:40:21 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3425605", abstract = "Similar to conventional video, the increasingly popular 360 virtual reality (VR) video requires copyright protection mechanisms. The classic approach for copyright protection is the introduction of a digital watermark into the video sequence. Due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:IBM, author = "Yang Wang and Meng Fang and Joey Tianyi Zhou and Tingting Mu and Dacheng Tao", title = "Introduction to Big Multimodal Multimedia Data with Deep Analytics", journal = j-TOMM, volume = "17", number = "1s", pages = "1:1--1:3", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447530", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447530", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:ZSC, author = "Xing Xu and Jialin Tian and Kaiyi Lin and Huimin Lu and Jie Shao and Heng Tao Shen", title = "Zero-shot Cross-modal Retrieval by Assembling {AutoEncoder} and Generative Adversarial Network", journal = j-TOMM, volume = "17", number = "1s", pages = "3:1--3:17", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3424341", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3424341", abstract = "Conventional cross-modal retrieval models mainly assume the same scope of the classes for both the training set and the testing set. This assumption limits their extensibility on zero-shot cross-modal retrieval (ZS-CMR), \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2021:DGL, author = "Sichao Fu and Weifeng Liu and Weili Guan and Yicong Zhou and Dapeng Tao and Changsheng Xu", title = "Dynamic Graph Learning Convolutional Networks for Semi-supervised Classification", journal = j-TOMM, volume = "17", number = "1s", pages = "4:1--4:13", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3412846", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3412846", abstract = "Over the past few years, graph representation learning (GRL) has received widespread attention on the feature representations of the non-Euclidean data. As a typical model of GRL, graph convolutional networks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:DNP, author = "Zhao Zhang and Jiahuan Ren and Haijun Zhang and Zheng Zhang and Guangcan Liu and Shuicheng Yan", title = "{DLRF-Net}: a Progressive Deep Latent Low-Rank Fusion Network for Hierarchical Subspace Discovery", journal = j-TOMM, volume = "17", number = "1s", pages = "5:1--5:24", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3402030", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3402030", abstract = "Low-rank coding-based representation learning is powerful for discovering and recovering the subspace structures in data, which has obtained an impressive performance; however, it still cannot obtain deep hidden \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:GMM, author = "Yi Zhang and Miaomiao Li and Siwei Wang and Sisi Dai and Lei Luo and En Zhu and Huiying Xu and Xinzhong Zhu and Chaoyun Yao and Haoran Zhou", title = "{Gaussian} Mixture Model Clustering with Incomplete Data", journal = j-TOMM, volume = "17", number = "1s", pages = "6:1--6:14", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408318", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408318", abstract = "Gaussian mixture model (GMM) clustering has been extensively studied due to its effectiveness and efficiency. Though demonstrating promising performance in various applications, it cannot effectively address the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:ROR, author = "Jing Zhang and Jiaqi Guo and Yonggong Ren", title = "Robust Ordinal Regression: User Credit Grading with Triplet Loss-Based Sampling", journal = j-TOMM, volume = "17", number = "1s", pages = "7:1--7:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408303", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408303", abstract = "With the development of social media sites, user credit grading, which served as an important and fashionable problem, has attracted substantial attention from a slew of developers and operators of mobile applications. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:EIE, author = "Xin Xu and Shiqin Wang and Zheng Wang and Xiaolong Zhang and Ruimin Hu", title = "Exploring Image Enhancement for Salient Object Detection in Low Light Images", journal = j-TOMM, volume = "17", number = "1s", pages = "8:1--8:19", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3414839", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3414839", abstract = "Low light images captured in a non-uniform illumination environment usually are degraded with the scene depth and the corresponding environment lights. 
This degradation results in severe object information loss in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:LSI, author = "Yanchun Li and Jianglian Cao and Zhetao Li and Sangyoon Oh and Nobuyoshi Komuro", title = "Lightweight Single Image Super-resolution with Dense Connection Distillation Network", journal = j-TOMM, volume = "17", number = "1s", pages = "9:1--9:17", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3414838", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3414838", abstract = "Single image super-resolution attempts to reconstruct a high-resolution (HR) image from its corresponding low-resolution (LR) image, which has been a research hotspot in computer vision and image processing for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:SDM, author = "Yang Wang", title = "Survey on Deep Multi-modal Data Analytics: Collaboration, Rivalry, and Fusion", journal = j-TOMM, volume = "17", number = "1s", pages = "10:1--10:25", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408317", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408317", abstract = "With the development of web technology, multi-modal or multi-view data has surged as a major stream for big data, where each modal/view encodes individual property of data objects. Often, different modalities are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:ISI, author = "Yang Wang and Meng Fang and Joey Tianyi Zhou and Tingting Mu and Dacheng Tao", title = "Introduction to the Special Issue on Fine-grained Visual Computing", journal = j-TOMM, volume = "17", number = "1s", pages = "11:1--11:3", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447532", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447532", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2021:AEN, author = "Yutao Hu and Xuhui Liu and Baochang Zhang and Jungong Han and Xianbin Cao", title = "Alignment Enhancement Network for Fine-grained Visual Categorization", journal = j-TOMM, volume = "17", number = "1s", pages = "12:1--12:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446208", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446208", abstract = "Fine-grained visual categorization (FGVC) aims to automatically recognize objects from different sub-ordinate categories. Despite attracting considerable attention from both academia and industry, it remains a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guan:2021:UPS, author = "Weili Guan and Zhaozheng Chen and Fuli Feng and Weifeng Liu and Liqiang Nie", title = "Urban Perception: Sensing Cities via a Deep Interactive Multi-task Learning Framework", journal = j-TOMM, volume = "17", number = "1s", pages = "13:1--13:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3424115", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3424115", abstract = "Social scientists have shown evidence that visual perceptions of urban attributes, such as safe, wealthy, and beautiful perspectives of the given cities, are highly correlated to the residents' behaviors and quality \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lu:2021:CIC, author = "Huimin Lu and Rui Yang and Zhenrong Deng and Yonglin Zhang and Guangwei Gao and Rushi Lan", title = "{Chinese} Image Captioning via Fuzzy Attention-based {DenseNet-BiLSTM}", journal = j-TOMM, volume = "17", number = "1s", pages = "14:1--14:18", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3422668", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3422668", abstract = "Chinese image description generation tasks usually have some challenges, such as single-feature extraction, lack of global information, and lack of detailed description of the image content. To address these limitations, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2021:WSS, author = "Junsheng Xiao and Huahu Xu and Honghao Gao and Minjie Bian and Yang Li", title = "A Weakly Supervised Semantic Segmentation Network by Aggregating Seed Cues: The Multi-Object Proposal Generation Perspective", journal = j-TOMM, volume = "17", number = "1s", pages = "15:1--15:19", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3419842", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3419842", abstract = "Weakly supervised semantic segmentation under image-level annotations is effectiveness for real-world applications. 
The small and sparse discriminative regions obtained from an image classification network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:RMR, author = "Chao Zhang and Xiaopei Wu and Jianchao Lu and Xi Zheng and Alireza Jolfaei and Quan Z. Sheng and Dongjin Yu", title = "{RICA-MD}: a Refined {ICA} Algorithm for Motion Detection", journal = j-TOMM, volume = "17", number = "1s", pages = "17:1--17:17", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3416492", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3416492", abstract = "With the rapid development of various computing technologies, the constraints of data processing capabilities gradually disappeared, and more data can be simultaneously processed to obtain better \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rahman:2021:MMP, author = "MD Abdur Rahman and M. Shamim Hossain and Nabil A. Alrajeh and B. B. 
Gupta", title = "A Multimodal, Multimedia Point-of-Care Deep Learning Framework for {COVID-19} Diagnosis", journal = j-TOMM, volume = "17", number = "1s", pages = "18:1--18:24", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3421725", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3421725", abstract = "In this article, we share our experiences in designing and developing a suite of deep neural network-(DNN) based COVID-19 case detection and recognition framework. Existing pathological tests such as RT-PCR-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:SFF, author = "Yidong Li and Wenhua Liu and Yi Jin and Yuanzhouhan Cao", title = "{SPGAN}: Face Forgery Using Spoofing Generative Adversarial Networks", journal = j-TOMM, volume = "17", number = "1s", pages = "19:1--19:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3432817", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3432817", abstract = "Current face spoof detection schemes mainly rely on physiological cues such as eye blinking, mouth movements, and micro-expression changes, or textural attributes of the face images [9]. But none of these methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qi:2021:CAW, author = "Lianyong Qi and Houbing Song and Xuyun Zhang and Gautam Srivastava and Xiaolong Xu and Shui Yu", title = "Compatibility-Aware {Web} {API} Recommendation for Mashup Creation via Textual Description Mining", journal = j-TOMM, volume = "17", number = "1s", pages = "20:1--20:19", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3417293", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Apr 17 08:50:01 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3417293", abstract = "With the ever-increasing prosperity of web Application Programming Interface (API) sharing platforms, it is becoming an economic and efficient way for software developers to design their interested mashups \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Krishnan:2021:SEQ, author = "Prabhakar Krishnan and Kurunandan Jain and Pramod George Jose and Krishnashree Achuthan and Rajkumar Buyya", title = "{SDN} Enabled {QoE} and Security Framework for Multimedia Applications in {5G} Networks", journal = j-TOMM, volume = "17", number = "2", pages = "39:1--39:29", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3377390", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3377390", abstract = "The technologies for real-time multimedia transmission and immersive 3D gaming applications are rapidly emerging, posing challenges in terms of performance, security, authentication, data privacy, and encoding. The communication channel for these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kumar:2021:ESE, author = "S. Sambath Kumar and M. 
Nandhini", title = "Entropy Slicing Extraction and Transfer Learning Classification for Early Diagnosis of {Alzheimer} Diseases with {sMRI}", journal = j-TOMM, volume = "17", number = "2", pages = "40:1--40:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3383749", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3383749", abstract = "Alzheimer's Disease (AD) is an irreversible neurogenerative disorder that undergoes progressive decline in memory and cognitive function and is characterized by structural brain Magnetic Resonance Images (sMRI). In recent years, sMRI data has played a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:TTF, author = "Xiaolong Xu and Zijie Fang and Lianyong Qi and Xuyun Zhang and Qiang He and Xiaokang Zhou", title = "{TripRes}: Traffic Flow Prediction Driven Resource Reservation for Multimedia {IoV} with Edge Computing", journal = j-TOMM, volume = "17", number = "2", pages = "41:1--41:21", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3401979", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3401979", abstract = "The Internet of Vehicles (IoV) connects vehicles, roadside units (RSUs) and other intelligent objects, enabling data sharing among them, thereby improving the efficiency of urban traffic and safety. Currently, collections of multimedia content, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2021:FDI, author = "Wei Liang and Jing Long and Kuan-Ching Li and Jianbo Xu and Nanjun Ma and Xia Lei", title = "A Fast Defogging Image Recognition Algorithm Based on Bilateral Hybrid Filtering", journal = j-TOMM, volume = "17", number = "2", pages = "42:1--42:16", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3391297", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3391297", abstract = "With the rapid advancement of video and image processing technologies in the Internet of Things, it is urgent to address the issues in real-time performance, clarity, and reliability of image recognition technology for a monitoring system in foggy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tong:2021:IPP, author = "Chao Tong and Mengze Zhang and Chao Lang and Zhigao Zheng", title = "An Image Privacy Protection Algorithm Based on Adversarial Perturbation Generative Networks", journal = j-TOMM, volume = "17", number = "2", pages = "43:1--43:14", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3381088", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3381088", abstract = "Today, users of social platforms upload a large number of photos. 
These photos contain personal private information, including user identity information, which is easily gleaned by intelligent detection algorithms. To thwart this, in this work, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2021:FAA, author = "Yunfei Fu and Hongchuan Yu and Chih-Kuo Yeh and Tong-Yee Lee and Jian J. Zhang", title = "Fast Accurate and Automatic Brushstroke Extraction", journal = j-TOMM, volume = "17", number = "2", pages = "44:1--44:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429742", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3429742", abstract = "Brushstrokes are viewed as the artist's ``handwriting'' in a painting. In many applications such as style learning and transfer, mimicking painting, and painting authentication, it is highly desired to quantitatively and accurately identify brushstroke \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{K:2021:AML, author = "Mythili K. 
and Manish Narwaria", title = "Assessment of Machine Learning-Based Audiovisual Quality Predictors: Why Uncertainty Matters", journal = j-TOMM, volume = "17", number = "2", pages = "45:1--45:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3430376", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3430376", abstract = "Quality assessment of audiovisual (AV) signals is important from the perspective of system design, optimization, and management of a modern multimedia communication system. However, automatic prediction of AV quality via the use of computational models \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hama:2021:EUM, author = "Kenta Hama and Takashi Matsubara and Kuniaki Uehara and Jianfei Cai", title = "Exploring Uncertainty Measures for Image-caption Embedding-and-retrieval Task", journal = j-TOMM, volume = "17", number = "2", pages = "46:1--46:19", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3425663", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3425663", abstract = "With the significant development of black-box machine learning algorithms, particularly deep neural networks, the practical demand for reliability assessment is rapidly increasing. On the basis of the concept that ``Bayesian deep learning knows what it \ldots{}''", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nguyen:2021:ISV, author = "Phuong-Anh Nguyen and Chong-Wah Ngo", title = "Interactive Search vs. Automatic Search: an Extensive Study on Video Retrieval", journal = j-TOMM, volume = "17", number = "2", pages = "47:1--47:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429457", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3429457", abstract = "This article conducts user evaluation to study the performance difference between interactive and automatic search. Particularly, the study aims to provide empirical insights of how the performance landscape of video search changes, with tens of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:TAE, author = "Yang Li and Guangcan Liu and Yubao Sun and Qingshan Liu and Shengyong Chen", title = "{$3$D} Tensor Auto-encoder with Application to Video Compression", journal = j-TOMM, volume = "17", number = "2", pages = "48:1--48:18", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3431768", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3431768", abstract = "Auto-encoder has been widely used to compress high-dimensional data such as the images and videos. 
However, the traditional auto-encoder network needs to store a large number of parameters. Namely, when the input data is of dimension n, the number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mehrabi:2021:MTC, author = "Abbas Mehrabi and Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen and Antti Yl{\"a}-J{\"a}{\"a}ski", title = "Multi-Tier {CloudVR}: Leveraging Edge Computing in Remote Rendered Virtual Reality", journal = j-TOMM, volume = "17", number = "2", pages = "49:1--49:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429441", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3429441", abstract = "The availability of high bandwidth with low-latency communication in 5G mobile networks enables remote rendered real-time virtual reality (VR) applications. Remote rendering of VR graphics in a cloud removes the need for local personal computer for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2021:ARO, author = "Lu Sun and Hussein {Al Osman} and Jochen Lang", title = "An Augmented Reality Online Assistance Platform for Repair Tasks", journal = j-TOMM, volume = "17", number = "2", pages = "50:1--50:23", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429285", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3429285", abstract = "Our augmented reality online assistance platform enables an expert to specify 6DoF movements of a component and apply the geometrical and physical constraints in real-time. We track the real components on the expert's side to monitor the operations of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2021:SAM, author = "Meiqi Zhao and Jianmin Zheng and Elvis S. 
Liu", title = "Server Allocation for Massively Multiplayer Online Cloud Games Using Evolutionary Optimization", journal = j-TOMM, volume = "17", number = "2", pages = "51:1--51:23", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433027", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3433027", abstract = "In recent years, Massively Multiplayer Online Games (MMOGs) are becoming popular, partially due to their sophisticated graphics and broad virtual world, and cloud gaming is demanded more than ever especially when entertaining with light and portable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2021:ISS, author = "Haiyang Wei and Zhixin Li and Feicheng Huang and Canlong Zhang and Huifang Ma and Zhongzhi Shi", title = "Integrating Scene Semantic Knowledge into Image Captioning", journal = j-TOMM, volume = "17", number = "2", pages = "52:1--52:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3439734", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3439734", abstract = "Most existing image captioning methods use only the visual information of the image to guide the generation of captions, lack the guidance of effective scene semantic information, and the current visual attention mechanism cannot adjust the focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gupta:2021:VSB, author = "Shikha Gupta and Krishan Sharma and Dileep Aroor Dinesh and Veena Thenkanidiyoor", title = "Visual Semantic-Based Representation Learning Using Deep {CNNs} for Scene Recognition", journal = j-TOMM, volume = "17", number = "2", pages = "53:1--53:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436494", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3436494", abstract = "In this work, we address the task of scene recognition from image data. A scene is a spatially correlated arrangement of various visual semantic contents also known as concepts, e.g., ``chair,'' ``car,'' ``sky,'' etc. Representation learning using visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2021:PCN, author = "Chun-ying Huang and Yun-chen Cheng and Guan-zhang Huang and Ching-ling Fan and Cheng-hsin Hsu", title = "On the Performance Comparisons of Native and Clientless Real-Time Screen-Sharing Technologies", journal = j-TOMM, volume = "17", number = "2", pages = "54:1--54:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3437881", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3437881", abstract = "Real-time screen-sharing provides users with ubiquitous access to remote applications, such as computer games, movie players, and desktop applications (apps), anywhere and anytime. In this article, we study the performance of different screen-sharing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2021:ACG, author = "Xin Yang and Zongliang Ma and Letian Yu and Ying Cao and Baocai Yin and Xiaopeng Wei and Qiang Zhang and Rynson W. H. 
Lau", title = "Automatic Comic Generation with Stylistic Multi-page Layouts and Emotion-driven Text Balloon Generation", journal = j-TOMM, volume = "17", number = "2", pages = "55:1--55:19", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3440053", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3440053", abstract = "In this article, we propose a fully automatic system for generating comic books from videos without any human intervention. Given an input video along with its subtitles, our approach first extracts informative keyframes by analyzing the subtitles and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sharma:2021:HQF, author = "Prasen Kumar Sharma and Sujoy Ghosh and Arijit Sur", title = "High-quality Frame Recurrent Video De-raining with Multi-contextual Adversarial Network", journal = j-TOMM, volume = "17", number = "2", pages = "56:1--56:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3444974", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3444974", abstract = "In this article, we address the problem of rain-streak removal in the videos. Unlike the image, challenges in video restoration comprise temporal consistency besides spatial enhancement. The researchers across the world have proposed several effective \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lan:2021:STR, author = "Xiangyuan Lan and Zifei Yang and Wei Zhang and Pong C. Yuen", title = "Spatial-temporal Regularized Multi-modality Correlation Filters for Tracking with Re-detection", journal = j-TOMM, volume = "17", number = "2", pages = "57:1--57:16", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3430257", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 5 07:35:45 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3430257", abstract = "The development of multi-spectrum image sensing technology has brought great interest in exploiting the information of multiple modalities (e.g., RGB and infrared modalities) for solving computer vision problems. In this article, we investigate how to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2021:ISI, author = "Amit Kumar Singh and Zhihan Lv and Hoon Ko", title = "Introduction to the Special Issue on {Recent Trends in Medical Data Security for e-Health Applications}", journal = j-TOMM, volume = "17", number = "2s", pages = "58:1--58:3", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3459601", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3459601", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2021:SHD, author = "A. K. Singh and A. Anand and Z. Lv and H. Ko and A. Mohan", title = "A Survey on Healthcare Data: a Security Perspective", journal = j-TOMM, volume = "17", number = "2s", pages = "59:1--59:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3422816", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3422816", abstract = "With the remarkable development of internet technologies, the popularity of smart healthcare has regularly come to the fore. Smart healthcare uses advanced technologies to transform the traditional medical system in an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2021:SPP, author = "Hongjiao Wu and Ashutosh Dhar Dwivedi and Gautam Srivastava", title = "Security and Privacy of Patient Information in Medical Systems Based on Blockchain Technology", journal = j-TOMM, volume = "17", number = "2s", pages = "60:1--60:17", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3408321", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3408321", abstract = "The essence of ``blockchain'' is a shared database in which information stored is un-falsifiable, traceable, open, and transparent. 
Therefore, to improve the security of private information in medical systems, this article \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:OBR, author = "Ting Wang and Xiangjun Ji and Aiguo Song and Kurosh Madani and Amine Chohra and Huimin Lu and Ramon Monero", title = "Output-Bounded and {RBFNN}-Based Position Tracking and Adaptive Force Control for Security Tele-Surgery", journal = j-TOMM, volume = "17", number = "2s", pages = "61:1--61:15", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3394920", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3394920", abstract = "In security e-health brain neurosurgery, one of the important processes is to move the electrocoagulation to the appropriate position in order to excavate the diseased tissue.$^1$ However, it has been problematic for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alkhariji:2021:SPD, author = "Lamya Alkhariji and Nada Alhirabi and Mansour Naser Alraja and Mahmoud Barhamgi and Omer Rana and Charith Perera", title = "Synthesising Privacy by Design Knowledge Toward Explainable {Internet of Things} Application Designing in Healthcare", journal = j-TOMM, volume = "17", number = "2s", pages = "62:1--62:29", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3434186", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3434186", abstract = "Privacy by Design (PbD) is the most common approach followed by software developers who aim to reduce risks within their application designs, yet it remains commonplace for developers to retain little \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tanveer:2021:PLT, author = "M. 
Tanveer and Tarun Gupta and Miten Shah and {For the Alzheimer's Disease Neuroimaging Initiative}", title = "Pinball Loss Twin Support Vector Clustering", journal = j-TOMM, volume = "17", number = "2s", pages = "63:1--63:23", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3409264", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3409264", abstract = "Twin Support Vector Clustering (TWSVC) is a clustering algorithm inspired by the principles of Twin Support Vector Machine (TWSVM). TWSVC has already outperformed other traditional plane based clustering algorithms. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sahu:2021:LMP, author = "Amiya Kumar Sahu and Suraj Sharma and Deepak Puthal", title = "Lightweight Multi-party Authentication and Key Agreement Protocol in {IoT}-based E-Healthcare Service", journal = j-TOMM, volume = "17", number = "2s", pages = "64:1--64:20", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3398039", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3398039", abstract = "Internet of Things (IoT) is playing a promising role in e-healthcare applications in the recent decades; nevertheless, security is one of the crucial challenges in the current field of study. Many healthcare devices \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rajput:2021:SBS, author = "Amitesh Singh Rajput and Vishesh Kumar Tanwar and Balasubramanian Raman", title = "{$Z$}-Score-Based Secure Biomedical Model for Effective Skin Lesion Segmentation Over {eHealth} Cloud", journal = j-TOMM, volume = "17", number = "2s", pages = "65:1--65:19", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3430806", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3430806", abstract = "This study aims to process the private medical data over eHealth cloud platform. The current pandemic situation, caused by Covid19 has made us to realize the importance of automatic remotely operated independent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2021:EEB, author = "Ashima Singh and Arwinder Dhillon and Neeraj Kumar and M. Shamim Hossain and Ghulam Muhammad and Manoj Kumar", title = "{eDiaPredict}: an Ensemble-based Framework for Diabetes Prediction", journal = j-TOMM, volume = "17", number = "2s", pages = "66:1--66:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3415155", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3415155", abstract = "Medical systems incorporate modern computational intelligence in healthcare.
Machine learning techniques are applied to predict the onset and reoccurrence of the disease, identify biomarkers for survivability \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Amato:2021:SPV, author = "Flora Amato and Valentina Casola and Giovanni Cozzolino and Alessandra {De Benedictis} and Nicola Mazzocca and Francesco Moscato", title = "A Security and Privacy Validation Methodology for e-Health Systems", journal = j-TOMM, volume = "17", number = "2s", pages = "67:1--67:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3412373", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3412373", abstract = "e-Health applications enable one to acquire, process, and share patient medical data to improve diagnosis, treatment, and patient monitoring. Despite the undeniable benefits brought by the digitization of health systems, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kasyap:2021:PPD, author = "Harsh Kasyap and Somanath Tripathy", title = "Privacy-preserving Decentralized Learning Framework for Healthcare System", journal = j-TOMM, volume = "17", number = "2s", pages = "68:1--68:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3426474", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3426474", abstract = "Clinical trials and drug discovery would not be effective without the collaboration of institutions. Earlier, it has been at the cost of individual's privacy. Several pacts and compliances have been enforced to avoid data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shamsolmoali:2021:ISI, author = "Pourya Shamsolmoali and Ruili Wang and A. H. Sadka", title = "Introduction to the Special Issue on {Advanced Approaches for Multiple Instance Learning on Multimedia Applications}", journal = j-TOMM, volume = "17", number = "2s", pages = "69:1--69:2", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3459603", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3459603", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ji:2021:MPG, author = "Ruyi Ji and Zeyu Liu and Libo Zhang and Jianwei Liu and Xin Zuo and Yanjun Wu and Chen Zhao and Haofeng Wang and Lin Yang", title = "Multi-peak Graph-based Multi-instance Learning for Weakly Supervised Object Detection", journal = j-TOMM, volume = "17", number = "2s", pages = "70:1--70:21", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3432861", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3432861", abstract = "Weakly supervised object detection (WSOD), aiming to detect objects with only image-level annotations, has become one of the research hotspots over the past few years. Recently, much effort has been devoted to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ding:2021:MSA, author = "Yaoling Ding and Liehuang Zhu and An Wang and Yuan Li and Yongjuan Wang and Siu Ming Yiu and Keke Gai", title = "A Multiple Sieve Approach Based on Artificial Intelligent Techniques and Correlation Power Analysis", journal = j-TOMM, volume = "17", number = "2s", pages = "71:1--71:21", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433165", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3433165", abstract = "Side-channel analysis achieves key recovery by analyzing physical signals generated during the operation of cryptographic devices. Power consumption is one kind of these signals and can be regarded as a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ji:2021:MIM, author = "Wanting Ji and Ruili Wang", title = "A Multi-instance Multi-label Dual Learning Approach for Video Captioning", journal = j-TOMM, volume = "17", number = "2s", pages = "72:1--72:18", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446792", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446792", abstract = "Video captioning is a challenging task in the field of multimedia processing, which aims to generate informative natural language descriptions/captions to describe video contents. 
Previous video captioning approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zareapoor:2021:EAN, author = "Masoumeh Zareapoor and Jie Yang", title = "Equivariant Adversarial Network for Image-to-image Translation", journal = j-TOMM, volume = "17", number = "2s", pages = "73:1--73:14", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458280", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458280", abstract = "Image-to-Image translation aims to learn an image from a source domain to a target domain. However, there are three main challenges, such as lack of paired datasets, multimodality, and diversity, that are associated \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mohammed:2021:MAF, author = "Mazin Abed Mohammed and Mohamed Elhoseny and Karrar Hameed Abdulkareem and Salama A. Mostafa and Mashael S. 
Maashi", title = "A Multi-agent Feature Selection and Hybrid Classification Model for {Parkinson}'s Disease Diagnosis", journal = j-TOMM, volume = "17", number = "2s", pages = "74:1--74:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433180", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3433180", abstract = "Parkinson's disease (PD) diagnostics includes numerous analyses related to the neurological, physical, and psychical status of the patient. Medical teams analyze multiple symptoms and patient history considering \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{An:2021:MTU, author = "Na An and Wei Qi Yan", title = "Multitarget Tracking Using {Siamese} Neural Networks", journal = j-TOMM, volume = "17", number = "2s", pages = "75:1--75:16", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3441656", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3441656", abstract = "In this article, we detect and track visual objects by using Siamese network or twin neural network. The Siamese network is constructed to classify moving objects based on the associations of object detection network and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2021:MCA, author = "Xiaochuan Tang and Mingzhe Liu and Hao Zhong and Yuanzhen Ju and Weile Li and Qiang Xu", title = "{MILL}: Channel Attention-based Deep Multiple Instance Learning for Landslide Recognition", journal = j-TOMM, volume = "17", number = "2s", pages = "76:1--76:11", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3454009", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jun 22 08:33:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3454009", abstract = "Landslide recognition is widely used in natural disaster risk management. Traditional landslide recognition is mainly conducted by geologists, which is accurate but inefficient. This article introduces multiple instance learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:NNB, author = "Yue Li and Yan Yi and Dong Liu and Li Li and Zhu Li and Houqiang Li", title = "Neural-Network-Based Cross-Channel Intra Prediction", journal = j-TOMM, volume = "17", number = "3", pages = "77:1--77:23", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3434250", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3434250", abstract = "To reduce the redundancy among different color channels, e.g., YUV, previous methods usually adopt a linear model that tends to be oversimple for complex image content. 
We propose a neural-network-based method for cross-channel prediction in intra frame \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:MML, author = "Zhandong Liu and Wengang Zhou and Houqiang Li", title = "{MFECN}: Multi-level Feature Enhanced Cumulative Network for Scene Text Detection", journal = j-TOMM, volume = "17", number = "3", pages = "78:1--78:22", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3440087", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3440087", abstract = "Recently, many scene text detection algorithms have achieved impressive performance by using convolutional neural networks. However, most of them do not make full use of the context among the hierarchical multi-level features to improve the performance of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dong:2021:SCL, author = "Xingbo Dong and Soohyong Kim and Zhe Jin and Jung Yeon Hwang and Sangrae Cho and Andrew Beng Jin Teoh", title = "Secure Chaff-less Fuzzy Vault for Face Identification Systems", journal = j-TOMM, volume = "17", number = "3", pages = "79:1--79:22", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3442198", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3442198", abstract = "Biometric cryptosystems such as fuzzy vaults represent one of the most popular approaches for secret and biometric template protection. However, they are solely designed for biometric verification, where the user is required to input both identity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2021:GLE, author = "Hezhen Hu and Wengang Zhou and Junfu Pu and Houqiang Li", title = "Global-Local Enhancement Network for {NMF}-Aware Sign Language Recognition", journal = j-TOMM, volume = "17", number = "3", pages = "80:1--80:19", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436754", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3436754", abstract = "Sign language recognition (SLR) is a challenging problem, involving complex manual features (i.e., hand gestures) and fine-grained non-manual features (NMFs) (i.e., facial expression, mouth shapes, etc.). Although manual features are dominant, non-manual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2021:RRN, author = "Feng Lin and Wengang Zhou and Jiajun Deng and Bin Li and Yan Lu and Houqiang Li", title = "Residual Refinement Network with Attribute Guidance for Precise Saliency Detection", journal = j-TOMM, volume = "17", number = "3", pages = "81:1--81:19", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3440694", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3440694", abstract = "As an important topic in the multimedia and computer vision fields, salient object detection has been researched for years. 
Recently, state-of-the-art performance has been witnessed with the aid of the fully convolutional networks (FCNs) and the various \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2021:IIR, author = "Hongdi Zheng and Junfeng Wang and Jianping Zhang and Ruirui Li", title = "{IRTS}: an Intelligent and Reliable Transmission Scheme for Screen Updates Delivery in {DaaS}", journal = j-TOMM, volume = "17", number = "3", pages = "82:1--82:24", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3440035", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3440035", abstract = "Desktop-as-a-service (DaaS) has been recognized as an elastic and economical solution that enables users to access personal desktops from anywhere at any time. During the interaction process of DaaS, users rely on screen updates to perceive execution \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:SCG, author = "Rui Wang and Dong Liang and Xiaochun Cao and Yuanfang Guo", title = "Semantic Correspondence with Geometric Structure Analysis", journal = j-TOMM, volume = "17", number = "3", pages = "83:1--83:21", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3441576", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3441576", abstract = "This article studies the correspondence problem for semantically similar images, which is challenging due to the joint visual and geometric deformations. We introduce the Flip-aware Distance Ratio method (FDR) to solve this problem from the perspective of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:SSS, author = "Xinfang Liu and Xiushan Nie and Junya Teng and Li Lian and Yilong Yin", title = "Single-shot Semantic Matching Network for Moment Localization in Videos", journal = j-TOMM, volume = "17", number = "3", pages = "84:1--84:14", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3441577", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3441577", abstract = "Moment localization in videos using natural language refers to finding the most relevant segment from videos given a natural language query. 
Most of the existing methods require video segment candidates for further matching with the query, which leads to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alaya:2021:PBD, author = "Bechir Alaya", title = "Payoff-based Dynamic Segment Replication and Graph Classification Method with Attribute Vectors Adapted to Urban {VANET}", journal = j-TOMM, volume = "17", number = "3", pages = "85:1--85:22", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3440018", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3440018", abstract = "Due to the number of constraints and the dynamic nature of vehicular ad hoc networks (VANET), effective video broadcasting always remains a difficult task. In this work, we proposed a quality of video visualization guarantee model based on a feedback loop \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dhiman:2021:PWS, author = "Chhavi Dhiman and Dinesh Kumar Vishwakarma and Paras Agarwal", title = "Part-wise Spatio-temporal Attention Driven {CNN}-based {$3$D} Human Action Recognition", journal = j-TOMM, volume = "17", number = "3", pages = "86:1--86:24", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3441628", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3441628", abstract = "Recently, human activity recognition using skeleton data is increasing due to its ease of acquisition and finer shape details. Still, it suffers from a wide range of intra-class variation, inter-class similarity among the actions and view variation due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nie:2021:PPF, author = "Jie Nie and Zhi-Qiang Wei and Weizhi Nie and An-An Liu", title = "{PGNet}: Progressive Feature Guide Learning Network for Three-dimensional Shape Recognition", journal = j-TOMM, volume = "17", number = "3", pages = "87:1--87:17", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3443708", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3443708", abstract = "Three-dimensional (3D) shape recognition is a popular topic and has potential application value in the field of computer vision. 
With the recent proliferation of deep learning, various deep learning models have achieved state-of-the-art performance. Among \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:VDB, author = "Shiguang Liu and Huixin Wang and Xiaoli Zhang", title = "Video Decolorization Based on the {CNN} and {LSTM} Neural Network", journal = j-TOMM, volume = "17", number = "3", pages = "88:1--88:18", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446619", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446619", abstract = "Video decolorization is the process of transferring three-channel color videos into single-channel grayscale videos, which is essentially the decolorization operation of video frames. Most existing video decolorization algorithms directly apply image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2021:DCN, author = "Zhenzhen Yang and Pengfei Xu and Yongpeng Yang and Bing-Kun Bao", title = "A Densely Connected Network Based on {U-Net} for Medical Image Segmentation", journal = j-TOMM, volume = "17", number = "3", pages = "89:1--89:14", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446618", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446618", abstract = "The U-Net has become the most popular structure in medical image segmentation in recent years. Although its performance for medical image segmentation is outstanding, a large number of experiments demonstrate that the classical U-Net network architecture \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:LCF, author = "Donglin Zhang and Xiao-Jun Wu and Jun Yu", title = "Label Consistent Flexible Matrix Factorization Hashing for Efficient Cross-modal Retrieval", journal = j-TOMM, volume = "17", number = "3", pages = "90:1--90:18", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446774", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446774", abstract = "Hashing methods have sparked a great revolution on large-scale cross-media search due to its effectiveness and efficiency. 
Most existing approaches learn unified hash representation in a common Hamming space to represent all multimodal data. However, the unified hash codes may not characterize the cross-modal data discriminatively, because the data may vary greatly due to its different dimensionalities, physical properties, and statistical information. In addition, most existing supervised cross-modal algorithms preserve the similarity relationship by constructing an $ n \times n $ pairwise similarity matrix, which requires a large amount of calculation and loses the category information. To mitigate these issues, a novel cross-media hashing approach is proposed in this article, dubbed label flexible matrix factorization hashing (LFMH). Specifically, LFMH jointly learns the modality-specific latent subspace with similar semantic by the flexible matrix factorization. In addition, LFMH guides the hash learning by utilizing the semantic labels directly instead of the large $ n \times n $ pairwise similarity matrix. LFMH transforms the heterogeneous data into modality-specific latent semantic representation. Therefore, we can obtain the hash codes by quantifying the representations, and the learned hash codes are consistent with the supervised labels of multimodal data. Then, we can obtain the similar binary codes of the corresponding modality, and the binary codes can characterize such samples flexibly. Accordingly, the derived hash codes have more discriminative power for single-modal and cross-modal retrieval tasks. Extensive experiments on eight different databases demonstrate that our model outperforms some competitive approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lokoc:2021:RIS, author = "Jakub Lokoc and Patrik Vesel{\'y} and Frantisek Mejzl{\'\i}k and Gregor Kovalc{\'\i}k and Tom{\'a}s Soucek and Luca Rossetto and Klaus Schoeffmann and Werner Bailer and Cathal Gurrin and Loris Sauter and Jaeyub Song and Stefanos Vrochidis and Jiaxin Wu and Bj{\"o}rn {\TH}{\'o}r J{\'o}nsson", title = "Is the Reign of Interactive Search Eternal? {Findings} from the {Video Browser Showdown 2020}", journal = j-TOMM, volume = "17", number = "3", pages = "91:1--91:26", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3445031", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3445031", abstract = "Comprehensive and fair performance evaluation of information retrieval systems represents an essential task for the current information age. Whereas Cranfield-based evaluations with benchmark datasets support development of retrieval models, significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:LIR, author = "Qianli Xu and Ana Garcia {Del Molino} and Jie Lin and Fen Fang and Vigneshwaran Subbaraju and Liyuan Li and Joo-Hwee Lim", title = "Lifelog Image Retrieval Based on Semantic Relevance Mapping", journal = j-TOMM, volume = "17", number = "3", pages = "92:1--92:18", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446209", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446209", abstract = "Lifelog analytics is an emerging research area with technologies embracing the latest advances in machine learning, wearable computing, and data analytics. However, state-of-the-art technologies are still inadequate to distill voluminous multimodal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Du:2021:RTE, author = "Gaoming Du and Jiting Wu and Hongfang Cao and Kun Xing and Zhenmin Li and Duoli Zhang and Xiaolei Wang", title = "A Real-Time Effective Fusion-Based Image Defogging Architecture on {FPGA}", journal = j-TOMM, volume = "17", number = "3", pages = "93:1--93:21", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446241", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446241", abstract = "Foggy weather reduces the visibility of photographed objects, causing image distortion and decreasing overall image quality. Many approaches (e.g., image restoration, image enhancement, and fusion-based methods) have been proposed to work out the problem. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2021:FRS, author = "Chenglizhao Chen and Hongmeng Zhao and Huan Yang and Teng Yu and Chong Peng and Hong Qin", title = "Full-reference Screen Content Image Quality Assessment by Fusing Multilevel Structure Similarity", journal = j-TOMM, volume = "17", number = "3", pages = "94:1--94:21", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447393", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447393", abstract = "Screen content images (SCIs) usually comprise various content types with sharp edges, in which artifacts or distortions can be effectively sensed by a vanilla structure similarity measurement in a full-reference manner. Nonetheless, almost all of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:DBS, author = "Honglin Li and Xiaoyang Mao and Mengdi Xu and Xiaogang Jin", title = "Deep-based Self-refined Face-top Coordination", journal = j-TOMM, volume = "17", number = "3", pages = "95:1--95:23", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446970", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3446970", abstract = "Face-top coordination, which exists in most clothes-fitting scenarios, is challenging due to varieties of attributes, implicit correlations, and tradeoffs between general preferences and individual preferences. We present a Deep-Based Self-Refined (DBSR) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2021:DAM, author = "Minxuan Lin and Fan Tang and Weiming Dong and Xiao Li and Changsheng Xu and Chongyang Ma", title = "Distribution Aligned Multimodal and Multi-domain Image Stylization", journal = j-TOMM, volume = "17", number = "3", pages = "96:1--96:17", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3450525", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3450525", abstract = "Multimodal and multi-domain stylization are two important problems in the field of image style transfer. 
Currently, there are few methods that can perform multimodal and multi-domain stylization simultaneously. In this study, we propose a unified \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Du:2021:IGS, author = "Yong Du and Yangyang Xu and Taizhong Ye and Qiang Wen and Chufeng Xiao and Junyu Dong and Guoqiang Han and Shengfeng He", title = "Invertible Grayscale with Sparsity Enforcing Priors", journal = j-TOMM, volume = "17", number = "3", pages = "97:1--97:17", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451993", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3451993", abstract = "Color dimensionality reduction is believed as a non-invertible process, as re-colorization results in perceptually noticeable and unrecoverable distortion. In this article, we propose to convert a color image into a grayscale image that can fully recover \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qian:2021:KAM, author = "Shengsheng Qian and Jun Hu and Quan Fang and Changsheng Xu", title = "Knowledge-aware Multi-modal Adaptive Graph Convolutional Networks for Fake News Detection", journal = j-TOMM, volume = "17", number = "3", pages = "98:1--98:23", month = aug, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451215", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Aug 19 08:56:09 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3451215", abstract = "In this article, we focus on fake news detection task and aim to automatically identify the fake news from vast amount of social media posts. To date, many approaches have been proposed to detect fake news, which includes traditional learning methods and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:ISI, author = "Yu-Dong Zhang and Juan Manuel Gorriz and Zhengchao Dong", title = "Introduction to the Special Issue on Explainable Deep Learning for Medical Image Computing", journal = j-TOMM, volume = "17", number = "3s", pages = "99:1--99:2", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3485046", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485046", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ni:2021:LCL, author = "Tongguang Ni and Yan Ding and Jing Xue and Kaijian Xia and Xiaoqing Gu and Yizhang Jiang", title = "Local Constraint and Label Embedding Multi-layer Dictionary Learning for Sperm Head Classification", journal = j-TOMM, volume = "17", number = "3s", pages = "100:1--100:16", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458927", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458927", abstract = "Morphological classification of human sperm heads is a key technology for diagnosing male infertility. Due to its sparse representation and learning capability, dictionary learning has shown remarkable performance in human sperm head classification. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2021:DAC, author = "Bingzhi Chen and Yishu Liu and Zheng Zhang and Yingjian Li and Zhao Zhang and Guangming Lu and Hongbing Yu", title = "Deep Active Context Estimation for Automated {COVID-19} Diagnosis", journal = j-TOMM, volume = "17", number = "3s", pages = "101:1--101:22", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457124", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3457124", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2021:MIC, author = "Xiangbin Liu and Jiesheng He and Liping Song and Shuai Liu and Gautam Srivastava", title = "Medical Image Classification based on an Adaptive Size Deep Learning Model", journal = j-TOMM, volume = "17", number = "3s", pages = "102:1--102:18", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3465220", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3465220", abstract = "With the rapid development of Artificial Intelligence (AI), deep learning has increasingly become a research hotspot in various fields, such as medical image classification. Traditional deep learning models use Bilinear Interpolation when processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lu:2021:EFD, author = "Siyuan Lu and Di Wu and Zheng Zhang and Shui-Hua Wang", title = "An Explainable Framework for Diagnosis of {COVID-19} Pneumonia via Transfer Learning and Discriminant Correlation Analysis", journal = j-TOMM, volume = "17", number = "3s", pages = "103:1--103:16", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3449785", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3449785", abstract = "The new coronavirus COVID-19 has been spreading all over the world in the last six months, and the death toll is still rising. The accurate diagnosis of COVID-19 is an emergent task as to stop the spreading of the virus. In this paper, we proposed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alizadehsani:2021:UAS, author = "Roohallah Alizadehsani and Danial Sharifrazi and Navid Hoseini Izadi and Javad Hassannataj Joloudari and Afshin Shoeibi and Juan M. Gorriz and Sadiq Hussain and Juan E. Arco and Zahra Alizadeh Sani and Fahime Khozeimeh and Abbas Khosravi and Saeid Nahavandi and Sheikh Mohammed Shariful Islam and U. 
Rajendra Acharya", title = "Uncertainty-Aware Semi-Supervised Method Using Large Unlabeled and Limited Labeled {COVID-19} Data", journal = j-TOMM, volume = "17", number = "3s", pages = "104:1--104:24", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3462635", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462635", abstract = "The new coronavirus has caused more than one million deaths and continues to spread rapidly. This virus targets the lungs, causing respiratory distress which can be mild or severe. The X-ray or computed tomography (CT) images of lungs can reveal whether \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kumar:2021:DDE, author = "Ambeshwar Kumar and Ramachandran Manikandan and Utku Kose and Deepak Gupta and Suresh C. Satapathy", title = "Doctor's Dilemma: Evaluating an Explainable Subtractive Spatial Lightweight Convolutional Neural Network for Brain Tumor Diagnosis", journal = j-TOMM, volume = "17", number = "3s", pages = "105:1--105:26", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457187", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3457187", abstract = "In Medicine Deep Learning has become an essential tool to achieve outstanding diagnosis on image data. 
However, one critical problem is that Deep Learning comes with complicated, black-box models so it is not possible to analyze their trust level \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "105", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Su:2021:HRP, author = "Ge Su and Bo Lin and Wei Luo and Jianwei Yin and Shuiguang Deng and Honghao Gao and Renjun Xu", title = "Hypomimia Recognition in {Parkinson}'s Disease With Semantic Features", journal = j-TOMM, volume = "17", number = "3s", pages = "106:1--106:20", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3476778", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3476778", abstract = "Parkinson's disease is the second most common neurodegenerative disorder, commonly affecting elderly people over the age of 65. As the cardinal manifestation, hypomimia, referred to as impairments in normal facial expressions, stays covert. Even some \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xin:2021:WEG, author = "Qi Xin and Shaohao Hu and Shuaiqi Liu and Ling Zhao and Shuihua Wang", title = "{WTRPNet}: an Explainable Graph Feature Convolutional Neural Network for Epileptic {EEG} Classification", journal = j-TOMM, volume = "17", number = "3s", pages = "107:1--107:18", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460522", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460522", abstract = "As one of the important tools of epilepsy diagnosis, the electroencephalogram (EEG) is noninvasive and presents no traumatic injury to patients. It contains a lot of physiological and pathological information that is easy to obtain. The automatic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cheng:2021:ISI, author = "Wen-Huang Cheng and Jiaying Liu and Nicu Sebe and Junsong Yuan and Hong-Han Shuai", title = "Introduction to the Special Issue on Explainable {AI} on Multimedia Computing", journal = j-TOMM, volume = "17", number = "3s", pages = "108:1--108:2", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3489522", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3489522", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2021:LFS, author = "Jiguo Li and Xinfeng Zhang and Jizheng Xu and Siwei Ma and Wen Gao", title = "Learning to Fool the Speaker Recognition", journal = j-TOMM, volume = "17", number = "3s", pages = "109:1--109:21", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468673", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3468673", abstract = "Due to the widespread deployment of fingerprint/face/speaker recognition systems, the risk in these systems, especially the adversarial attack, has drawn increasing attention in recent years. Previous researches mainly studied the adversarial attack to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2021:PNR, author = "Chenggang Yan and Tong Teng and Yutao Liu and Yongbing Zhang and Haoqian Wang and Xiangyang Ji", title = "Precise No-Reference Image Quality Evaluation Based on Distortion Identification", journal = j-TOMM, volume = "17", number = "3s", pages = "110:1--110:21", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468872", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3468872", abstract = "The difficulty of no-reference image quality assessment (NR IQA) often lies in the lack of knowledge about the distortion in the image, which makes quality assessment blind and thus inefficient. To tackle such issue, in this article, we propose a novel \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2021:EAM, author = "Yung-Yao Chen and Sin-Ye Jhong and Chih-Hsien Hsia and Kai-Lung Hua", title = "Explainable {AI}: a Multispectral Palm-Vein Identification System with New Augmentation Features", journal = j-TOMM, volume = "17", number = "3s", pages = "111:1--111:21", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468873", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3468873", abstract = "Recently, as one of the most promising biometric traits, the vein has attracted the attention of both academia and industry because of its living body identification and the convenience of the acquisition process. State-of-the-art techniques can provide \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2021:XEC, author = "Yu-Sheng Lin and Zhe-Yu Liu and Yu-An Chen and Yu-Siang Wang and Ya-Liang Chang and Winston H. Hsu", title = "{xCos}: an Explainable Cosine Metric for Face Verification Task", journal = j-TOMM, volume = "17", number = "3s", pages = "112:1--112:16", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3469288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3469288", abstract = "We study the XAI (explainable AI) on the face recognition task, particularly the face verification. 
Face verification has become a crucial task in recent days and it has been deployed to plenty of applications, such as access control, surveillance, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shorfuzzaman:2021:EDL, author = "Mohammad Shorfuzzaman and M. Shamim Hossain and Abdulmotaleb {El Saddik}", title = "An Explainable Deep Learning Ensemble Model for Robust Diagnosis of Diabetic Retinopathy Grading", journal = j-TOMM, volume = "17", number = "3s", pages = "113:1--113:24", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3469841", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3469841", abstract = "Diabetic retinopathy (DR) is one of the most common causes of vision loss in people who have diabetes for a prolonged period. Convolutional neural networks (CNNs) have become increasingly popular for computer-aided DR diagnosis using retinal fundus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2021:BBD, author = "Zhenyu Wu and Zhaowen Wang and Ye Yuan and Jianming Zhang and Zhangyang Wang and Hailin Jin", title = "Black-Box Diagnosis and Calibration on {GAN} Intra-Mode Collapse: a Pilot Study", journal = j-TOMM, volume = "17", number = "3s", pages = "114:1--114:18", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3472768", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472768", abstract = "Generative adversarial networks (GANs) nowadays are capable of producing images of incredible realism. Two concerns raised are whether the state-of-the-art GAN's learned distribution still suffers from mode collapse and what to do if so. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xia:2021:SED, author = "Bohui Xia and Xueting Wang and Toshihiko Yamasaki", title = "Semantic Explanation for Deep Neural Networks Using Feature Interactions", journal = j-TOMM, volume = "17", number = "3s", pages = "115:1--115:19", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3474557", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3474557", abstract = "Given the promising results obtained by deep-learning techniques in multimedia analysis, the explainability of predictions made by networks has become important in practical applications. We present a method to generate semantic and quantitative \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:LDS, author = "Yang Wang and Yang Cao and Jing Zhang and Feng Wu and Zheng-Jun Zha", title = "Leveraging Deep Statistics for Underwater Image Enhancement", journal = j-TOMM, volume = "17", number = "3s", pages = "116:1--116:20", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3489520", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Dec 31 09:04:25 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3489520", abstract = "Underwater imaging often suffers from color cast and contrast degradation due to range-dependent medium absorption and light scattering. 
Introducing image statistics as prior has been proved to be an effective solution for underwater image enhancement. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "116", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2021:DSG, author = "Junyi Wu and Yan Huang and Qiang Wu and Zhipeng Gao and Jianqiang Zhao and Liqin Huang", title = "Dual-Stream Guided-Learning via a Priori Optimization for Person Re-identification", journal = j-TOMM, volume = "17", number = "4", pages = "117:1--117:22", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447715", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447715", abstract = "The task of person re-identification (re-ID) is to find the same pedestrian across non-overlapping camera views. Generally, the performance of person re-ID can be affected by background clutter. However, existing segmentation algorithms cannot obtain \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2021:ACO, author = "Zhaoliang He and Hongshan Li and Zhi Wang and Shutao Xia and Wenwu Zhu", title = "Adaptive Compression for Online Computer Vision: an Edge Reinforcement Learning Approach", journal = j-TOMM, volume = "17", number = "4", pages = "118:1--118:23", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447878", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3447878", abstract = "With the growth of computer vision-based applications, an explosive amount of images have been uploaded to cloud servers that host such online computer vision algorithms, usually in the form of deep learning models. JPEG has been used as the de facto. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2021:SDE, author = "Yingwei Pan and Yue Chen and Qian Bao and Ning Zhang and Ting Yao and Jingen Liu and Tao Mei", title = "{Smart Director}: an Event-Driven Directing System for Live Broadcasting", journal = j-TOMM, volume = "17", number = "4", pages = "119:1--119:18", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3448981", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3448981", abstract = "Live video broadcasting normally requires a multitude of skills and expertise with domain knowledge to enable multi-camera productions. As the number of cameras keeps increasing, directing a live sports broadcast has now become more complicated and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:DSS, author = "Chunyan Xu and Rong Liu and Tong Zhang and Zhen Cui and Jian Yang and Chunlong Hu", title = "Dual-Stream Structured Graph Convolution Network for Skeleton-Based Action Recognition", journal = j-TOMM, volume = "17", number = "4", pages = "120:1--120:22", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3450410", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3450410", abstract = "In this work, we propose a dual-stream structured graph convolution network (DS-SGCN) to solve the skeleton-based action recognition problem. The spatio-temporal coordinates and appearance contexts of the skeletal joints are jointly integrated into the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:UDE, author = "Jie Wang and Kaibin Tian and Dayong Ding and Gang Yang and Xirong Li", title = "Unsupervised Domain Expansion for Visual Categorization", journal = j-TOMM, volume = "17", number = "4", pages = "121:1--121:24", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3448108", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3448108", abstract = "Expanding visual categorization into a novel domain without the need of extra annotation has been a long-term interest for multimedia intelligence. 
Previously, this challenge has been approached by unsupervised domain adaptation (UDA). Given labeled data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mawalim:2021:TIR, author = "Candy Olivia Mawalim and Shogo Okada and Yukiko I. Nakano", title = "Task-independent Recognition of Communication Skills in Group Interaction Using Time-series Modeling", journal = j-TOMM, volume = "17", number = "4", pages = "122:1--122:27", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3450283", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3450283", abstract = "Case studies of group discussions are considered an effective way to assess communication skills (CS). This method can help researchers evaluate participants' engagement with each other in a specific realistic context. In this article, multimodal analysis \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:WTG, author = "Bo Zhang and Rui Zhang and Niccolo Bisagno and Nicola Conci and Francesco G. B. {De Natale} and Hongbo Liu", title = "Where Are They Going? 
{Predicting} Human Behaviors in Crowded Scenes", journal = j-TOMM, volume = "17", number = "4", pages = "123:1--123:19", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3449359", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3449359", abstract = "In this article, we propose a framework for crowd behavior prediction in complicated scenarios. The fundamental framework is designed using the standard encoder-decoder scheme, which is built upon the long short-term memory module to capture the temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Silva:2021:UMC, author = "Ellen P. Silva and Nat{\'a}lia Vieira and Glauco Amorim and Renata Mousinho and Gustavo Guedes and Gheorghita Ghinea and Joel A. F. {Dos Santos}", title = "Using Multisensory Content to Impact the Quality of Experience of Reading Digital Books", journal = j-TOMM, volume = "17", number = "4", pages = "124:1--124:18", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458676", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458676", abstract = "Multisensorial books enrich a story with either traditional multimedia content or sensorial effects. The main idea is to increase children's interest in reading by enhancing their QoE while reading. Studies on enriched and/or augmented e-books also \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiang:2021:BDC, author = "Weitao Jiang and Weixuan Wang and Haifeng Hu", title = "Bi-Directional Co-Attention Network for Image Captioning", journal = j-TOMM, volume = "17", number = "4", pages = "125:1--125:20", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460474", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460474", abstract = "Image Captioning, which automatically describes an image with natural language, is regarded as a fundamental challenge in computer vision. In recent years, significant advance has been made in image captioning through improving attention mechanism. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shen:2021:CDO, author = "Xiangjun Shen and Jinghui Zhou and Zhongchen Ma and Bingkun Bao and Zhengjun Zha", title = "Cross-Domain Object Representation via Robust Low-Rank Correlation Analysis", journal = j-TOMM, volume = "17", number = "4", pages = "126:1--126:20", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458825", abstract = "Cross-domain data has become very popular recently since various viewpoints and different sensors tend to facilitate better data representation. 
In this article, we propose a novel cross-domain object representation algorithm (RLRCA) which not only \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2021:CMH, author = "Xing Xu and Yifan Wang and Yixuan He and Yang Yang and Alan Hanjalic and Heng Tao Shen", title = "Cross-Modal Hybrid Feature Fusion for Image-Sentence Matching", journal = j-TOMM, volume = "17", number = "4", pages = "127:1--127:23", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458281", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458281", abstract = "Image-sentence matching is a challenging task in the field of language and vision, which aims at measuring the similarities between images and sentence descriptions. Most existing methods independently map the global features of images and sentences into \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Messina:2021:FGV, author = "Nicola Messina and Giuseppe Amato and Andrea Esuli and Fabrizio Falchi and Claudio Gennaro and St{\'e}phane Marchand-Maillet", title = "Fine-Grained Visual Textual Alignment for Cross-Modal Retrieval Using Transformer Encoders", journal = j-TOMM, volume = "17", number = "4", pages = "128:1--128:23", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451390", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3451390", abstract = "Despite the evolution of deep-learning-based visual-textual processing systems, precise multi-modal matching remains a challenging task. In this work, we tackle the task of cross-modal retrieval through image-sentence matching based on word-region \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2021:HSP, author = "Xuan Ma and Xiaoshan Yang and Junyu Gao and Changsheng Xu", title = "Health Status Prediction with Local-Global Heterogeneous Behavior Graph", journal = j-TOMM, volume = "17", number = "4", pages = "129:1--129:21", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457893", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3457893", abstract = "Health management is getting increasing attention all over the world. 
However, existing health management mainly relies on hospital examination and treatment, which are complicated and untimely. The emergence of mobile devices provides the possibility to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhai:2021:PQA, author = "Guangtao Zhai and Wei Sun and Xiongkuo Min and Jiantao Zhou", title = "Perceptual Quality Assessment of Low-light Image Enhancement", journal = j-TOMM, volume = "17", number = "4", pages = "130:1--130:24", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457905", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3457905", abstract = "Low-light image enhancement algorithms (LIEA) can light up images captured in dark or back-lighting conditions. However, LIEA may introduce various distortions such as structure damage, color shift, and noise into the enhanced images. Despite various \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mishra:2021:DBR, author = "Prerna Mishra and Santosh Kumar and Mithilesh Kumar Chaube", title = "Dissimilarity-Based Regularized Learning of Charts", journal = j-TOMM, volume = "17", number = "4", pages = "131:1--131:23", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458884", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458884", abstract = "Chart images exhibit significant variabilities that make each image different from others even though they belong to the same class or categories. Classification of charts is a major challenge because each chart class has variations in features, structure,. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nandanwar:2021:NFB, author = "Lokesh Nandanwar and Palaiahnakote Shivakumara and Divya Krishnani and Raghavendra Ramachandra and Tong Lu and Umapada Pal and Mohan Kankanhalli", title = "A New Foreground-Background based Method for Behavior-Oriented Social Media Image Classification", journal = j-TOMM, volume = "17", number = "4", pages = "132:1--132:25", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458051", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458051", abstract = "Due to various applications, research on personal traits using information on social media has become an important area. In this paper, a new method for the classification of behavior-oriented social images uploaded on various social media platforms is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alahmadi:2021:ABS, author = "Mohannad Alahmadi and Peter Pocta and Hugh Melvin", title = "An Adaptive Bitrate Switching Algorithm for Speech Applications in Context of {WebRTC}", journal = j-TOMM, volume = "17", number = "4", pages = "133:1--133:21", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458751", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3458751", abstract = "Web Real-Time Communication (WebRTC) combines a set of standards and technologies to enable high-quality audio, video, and auxiliary data exchange in web browsers and mobile applications. It enables peer-to-peer multimedia sessions over IP networks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2021:FVS, author = "Wei Gao and Linjie Zhou and Lvfang Tao", title = "A Fast View Synthesis Implementation Method for Light Field Applications", journal = j-TOMM, volume = "17", number = "4", pages = "134:1--134:20", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3459098", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3459098", abstract = "View synthesis (VS) for light field images is a very time-consuming task due to the great quantity of involved pixels and intensive computations, which may prevent it from the practical three-dimensional real-time systems. In this article, we propose an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2021:BCR, author = "Jianhai Zhang and Zhiyong Feng and Yong Su and Meng Xing", title = "{Bayesian} Covariance Representation with Global Informative Prior for {$3$D} Action Recognition", journal = j-TOMM, volume = "17", number = "4", pages = "135:1--135:22", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460235", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460235", abstract = "For the merits of high-order statistics and Riemannian geometry, covariance matrix has become a generic feature representation for action recognition. 
An independent action can be represented by an empirical statistics over all of its pose samples. Two \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "135", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2021:PAP, author = "Anqi Zhu and Lin Zhang and Juntao Chen and Yicong Zhou", title = "Pedestrian-Aware Panoramic Video Stitching Based on a Structured Camera Array", journal = j-TOMM, volume = "17", number = "4", pages = "136:1--136:24", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460511", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460511", abstract = "The panorama stitching system is an indispensable module in surveillance or space exploration. Such a system enables the viewer to understand the surroundings instantly by aligning the surrounding images on a plane and fusing them naturally. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "136", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2021:NDB, author = "Yizhen Chen and Haifeng Hu", title = "{Y-Net}: Dual-branch Joint Network for Semantic Segmentation", journal = j-TOMM, volume = "17", number = "4", pages = "137:1--137:22", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460940", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460940", abstract = "Most existing segmentation networks are built upon a ``U-shaped'' encoder-decoder structure, where the multi-level features extracted by the encoder are gradually aggregated by the decoder. Although this structure has been proven to be effective in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "137", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2021:DNA, author = "Jinwei Wang and Wei Huang and Xiangyang Luo and Yun-Qing Shi and Sunil Kr. Jha", title = "Detecting Non-Aligned Double {JPEG} Compression Based on Amplitude-Angle Feature", journal = j-TOMM, volume = "17", number = "4", pages = "138:1--138:18", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3464388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3464388", abstract = "Due to the popularity of JPEG format images in recent years, JPEG images will inevitably involve image editing operation. 
Thus, some tramped images will leave tracks of Non-aligned double JPEG (NA-DJPEG) compression. By detecting the presence of NA-DJPEG \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "138", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jia:2021:RGL, author = "Wei Jia and Li Li and Zhu Li and Xiang Zhang and Shan Liu", title = "Residual-guided In-loop Filter Using Convolution Neural Network", journal = j-TOMM, volume = "17", number = "4", pages = "139:1--139:19", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460820", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460820", abstract = "The block-based coding structure in the hybrid video coding framework inevitably introduces compression artifacts such as blocking, ringing, and so on. To compensate for those artifacts, extensive filtering techniques were proposed in the loop of video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "139", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2021:TMF, author = "Zhihan Lv and Houbing Song", title = "Trust Mechanism of Feedback Trust Weight in Multimedia Network", journal = j-TOMM, volume = "17", number = "4", pages = "140:1--140:26", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3391296", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Jan 14 07:01:30 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3391296", abstract = "It is necessary to solve the inaccurate data arising from data reliability ignored by most data fusion algorithms drawing upon collaborative filtering and fuzzy network theory. Therefore, a model is constructed based on the collaborative filtering \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "140", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yao:2022:SLM, author = "Peng Yao and Jieqing Feng", title = "Sparse {LIDAR} Measurement Fusion with Joint Updating Cost for Fast Stereo Matching", journal = j-TOMM, volume = "18", number = "1", pages = "1:1--1:18", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3471870", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3471870", abstract = "The complementary virtues of active and passive depth sensors inspire the LIDAR-Stereo fusion for enhancing the accuracy of stereo matching. 
However, most of the fusion based stereo matching algorithms have exploited dense LIDAR priors with single fusion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Karagkioules:2022:OLA, author = "Theodoros Karagkioules and Georgios S. Paschos and Nikolaos Liakopoulos and Attilio Fiandrotti and Dimitrios Tsilimantos and Marco Cagnazzo", title = "Online Learning for Adaptive Video Streaming in Mobile Networks", journal = j-TOMM, volume = "18", number = "1", pages = "2:1--2:22", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460819", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3460819", abstract = "In this paper, we propose a novel algorithm for video bitrate adaptation in HTTP Adaptive Streaming (HAS), based on online learning. The proposed algorithm, named Learn2Adapt (L2A), is shown to provide a robust bitrate adaptation strategy which, unlike \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fan:2022:MUE, author = "Ching-Ling Fan and Tse-Hou Hung and Cheng-Hsin Hsu", title = "Modeling the User Experience of Watching 360${}^\circ $ Videos with Head-Mounted Displays", journal = j-TOMM, volume = "18", number = "1", pages = "3:1--3:23", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3463825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3463825", abstract = "Conducting user studies to quantify the Quality of Experience (QoE) of watching the increasingly more popular 360${}^\circ $ videos in Head-Mounted Displays (HMDs) is time-consuming, tedious, and expensive. Deriving QoE models, however, is very challenging because \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{S:2022:TRL, author = "Baiju P. S. and Sudhish N. George", title = "{TTV} Regularized {LRTA} Technique for the Estimation of Haze Model Parameters in Video Dehazing", journal = j-TOMM, volume = "18", number = "1", pages = "4:1--4:22", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3465454", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3465454", abstract = "Nowadays, intelligent transport systems have a major role in providing a safe and secure traffic society for passengers, pedestrians, and vehicles. 
However, some bad weather conditions such as haze or fog may affect the visual clarity of video footage \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Aloufi:2022:MDT, author = "Samah Aloufi and Abdulmotaleb {El Saddik}", title = "{MMSUM} Digital Twins: a Multi-view Multi-modality Summarization Framework for Sporting Events", journal = j-TOMM, volume = "18", number = "1", pages = "5:1--5:25", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462777", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462777", abstract = "Sporting events generate a massive amount of traffic on social media with live moment-to-moment accounts as any given situation unfolds. The generated data are intensified by fans feelings, reactions, and subjective opinions towards what happens during \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:MFF, author = "Zhoutao Wang and Qian Xie and Mingqiang Wei and Kun Long and Jun Wang", title = "Multi-feature Fusion {VoteNet} for {$3$D} Object Detection", journal = j-TOMM, volume = "18", number = "1", pages = "6:1--6:17", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462219", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462219", abstract = "In this article, we propose a Multi-feature Fusion VoteNet (MFFVoteNet) framework for improving the 3D object detection performance in cluttered and heavily occluded scenes. Our method takes the point cloud and the synchronized RGB image as inputs to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Uddin:2022:NMM, author = "Md Azher Uddin and Joolekha Bibi Joolee and Young-Koo Lee and Kyung-Ah Sohn", title = "A Novel Multi-Modal Network-Based Dynamic Scene Understanding", journal = j-TOMM, volume = "18", number = "1", pages = "7:1--7:19", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462218", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462218", abstract = "In recent years, dynamic scene understanding has gained attention from researchers because of its widespread applications. 
The main important factor in successfully understanding the dynamic scenes lies in jointly representing the appearance and motion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:FEA, author = "Shiguang Liu and Huixin Wang and Min Pei", title = "Facial-expression-aware Emotional Color Transfer Based on Convolutional Neural Network", journal = j-TOMM, volume = "18", number = "1", pages = "8:1--8:19", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3464382", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3464382", abstract = "Emotional color transfer aims to change the evoked emotion of a source image to that of a target image by adjusting color distribution. Most of existing emotional color transfer methods only consider the low-level visual features of an image and ignore \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{PeresRebelo:2022:IAI, author = "Ana Daniela {Peres Rebelo} and In{\^e}s {Guedes De Oliveira} and
Damion D. E. Verboom", title = "The Impact of Artificial Intelligence on the Creativity of Videos", journal = j-TOMM, volume = "18", number = "1", pages = "9:1--9:27", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462634", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462634", abstract = "This study explored the impact Artificial Intelligence (AI) has on the evaluation of creative elements in artistic videos. The aim was to verify to what extent the use of an AI algorithm (Style Transfer) contributes to changes in the perceived creativity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2022:LHV, author = "Yaguang Song and Junyu Gao and Xiaoshan Yang and Changsheng Xu", title = "Learning Hierarchical Video Graph Networks for One-Stop Video Delivery", journal = j-TOMM, volume = "18", number = "1", pages = "10:1--10:23", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3466886", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3466886", abstract = "The explosive growth of video data has brought great challenges to video retrieval, which aims to find out related videos from a video collection. Most users are usually not interested in all the content of retrieved videos but have a more fine-grained \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mao:2022:MGD, author = "Aihua Mao and Yuan Liang and Jianbo Jiao and Yongtuo Liu and Shengfeng He", title = "Mask-Guided Deformation Adaptive Network for Human Parsing", journal = j-TOMM, volume = "18", number = "1", pages = "11:1--11:20", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3467889", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3467889", abstract = "Due to the challenges of densely compacted body parts, nonrigid clothing items, and severe overlap in crowd scenes, human parsing needs to focus more on multilevel feature representations compared to general scene parsing tasks. Based on this observation, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tiotsop:2022:MIM, author = "Lohic Fotio Tiotsop and Tomas Mizdos and Marcus Barkowsky and Peter Pocta and Antonio Servetti and Enrico Masala", title = "Mimicking Individual Media Quality Perception with Neural Network based Artificial Observers", journal = j-TOMM, volume = "18", number = "1", pages = "12:1--12:25", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3464393", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3464393", abstract = "The media quality assessment research community has traditionally been focusing on developing objective algorithms to predict the result of a typical subjective experiment in terms of Mean Opinion Score (MOS) value. However, the MOS, being a single value, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Thong:2022:DSV, author = "William Thong and Cees G. M. Snoek", title = "Diversely-Supervised Visual Product Search", journal = j-TOMM, volume = "18", number = "1", pages = "13:1--13:22", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3461646", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3461646", abstract = "This article strives for a diversely supervised visual product search, where queries specify a diverse set of labels to search for. 
Where previous works have focused on representing attribute, instance, or category labels individually, we consider them \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Farhat:2022:CCC, author = "Farshid Farhat and Mohammad Mahdi Kamani and James Z. Wang", title = "{CAPTAIN}: Comprehensive Composition Assistance for Photo Taking", journal = j-TOMM, volume = "18", number = "1", pages = "14:1--14:24", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462762", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462762", abstract = "Many people are interested in taking astonishing photos and sharing them with others. Emerging high-tech hardware and software facilitate the ubiquitousness and functionality of digital photography. Because composition matters in photography, researchers \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Holloman:2022:DSS, author = "Amanda K. Holloman and Chris S. 
Crawford", title = "Defining Scents: a Systematic Literature Review of Olfactory-based Computing Systems", journal = j-TOMM, volume = "18", number = "1", pages = "15:1--15:22", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3470975", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3470975", abstract = "The human sense of smell is a primal ability that has the potential to reveal unexplored relationships between user behaviors and technology. Humans use millions of olfactory receptor cells to observe the environment around them. Olfaction studies are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2022:HIR, author = "Xian-Hua Han and Yinqiang Zheng and Yen-Wei Chen", title = "Hyperspectral Image Reconstruction Using Multi-scale Fusion Learning", journal = j-TOMM, volume = "18", number = "1", pages = "16:1--16:21", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477396", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3477396", abstract = "Hyperspectral imaging is a promising imaging modality that simultaneously captures several images for the same scene on narrow spectral bands, and it has made considerable progress in different fields, such as agriculture, astronomy, and surveillance. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tasaka:2022:EMC, author = "Shuji Tasaka", title = "An Empirical Method for Causal Inference of Constructs for {QoE} in Haptic-Audiovisual Communications", journal = j-TOMM, volume = "18", number = "1", pages = "17:1--17:24", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473986", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473986", abstract = "This article proposes an empirical method for inferring causal directions in multidimensional Quality of Experience (QoE) in multimedia communications, noting that causation in QoE is perceptual. As an example for modeling framework, we pick up a Bayesian \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2022:RIT, author = "Dongbao Yang and Yu Zhou and Wei Shi and Dayan Wu and Weiping Wang", title = "{RD-IOD}: Two-Level Residual-Distillation-Based Triple-Network for Incremental Object Detection", journal = j-TOMM, volume = "18", number = "1", pages = "18:1--18:23", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472393", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472393", abstract = "As a basic component in multimedia applications, object detectors are generally trained on a fixed set of classes that are pre-defined. 
However, new object classes often emerge after the models are trained in practice. Modern object detectors based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hsu:2022:OIV, author = "Chih-Fan Hsu and Tse-Hou Hung and Cheng-Hsin Hsu", title = "Optimizing Immersive Video Coding Configurations Using Deep Learning: a Case Study on {TMIV}", journal = j-TOMM, volume = "18", number = "1", pages = "19:1--19:25", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3471191", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3471191", abstract = "Immersive video streaming technologies improve Virtual Reality (VR) user experience by providing users more intuitive ways to move in simulated worlds, e.g., with 6 Degree-of-Freedom (6DoF) interaction mode. A naive method to achieve 6DoF is deploying \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Siegfried:2022:RUG, author = "R{\'e}my Siegfried and Jean-Marc Odobez", title = "Robust Unsupervised Gaze Calibration Using Conversation and Manipulation Attention Priors", journal = j-TOMM, volume = "18", number = "1", pages = "20:1--20:27", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472622", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472622", abstract = "Gaze estimation is a difficult task, even for humans. However, as humans, we are good at understanding a situation and exploiting it to guess the expected visual focus of attention of people, and we usually use this information to retrieve people's gaze. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:LLS, author = "Jing Wang and Weiqing Min and Sujuan Hou and Shengnan Ma and Yuanjie Zheng and Shuqiang Jiang", title = "{LogoDet-3K}: a Large-scale Image Dataset for Logo Detection", journal = j-TOMM, volume = "18", number = "1", pages = "21:1--21:19", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3466780", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3466780", abstract = "Logo detection has been gaining considerable attention because of its wide range of applications in the multimedia field, such as copyright infringement detection, brand visibility monitoring, and product brand management on social media. In this article, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:ALC, author = "Da-Chun Wu and Yu-Tsung Hsu", title = "Authentication of {LINE} Chat History Files by Information Hiding", journal = j-TOMM, volume = "18", number = "1", pages = "22:1--22:23", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3474225", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3474225", abstract = "With the prevalence of smartphones, message exchanges via mobile chatting programs like LINE have become popular. 
The messages in the form of chat records in a LINE chat history, after being downloaded for legal uses, might be tampered with illicitly. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:PPM, author = "Changming Liu and Xiaojing Ma and Sixing Cao and Jiayun Fu and Bin B. Zhu", title = "Privacy-preserving Motion Detection for {HEVC}-compressed Surveillance Video", journal = j-TOMM, volume = "18", number = "1", pages = "23:1--23:27", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472669", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:22:44 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472669", abstract = "In the cloud era, a large amount of data is uploaded to and processed by public clouds. The risk of privacy leakage has become a major concern for cloud users. Cloud-based video surveillance requires motion detection, which may reveal the privacy of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:ISIa, author = "Shiliang Zhang and Guorong Li and Weigang Zhang and Qingming Huang and Tiejun Huang and Mubarak Shah and Nicu Sebe", title = "Introduction to the Special Issue on Fine-Grained Visual Recognition and Re-Identification", journal = j-TOMM, volume = "18", number = "1s", pages = "24:1--24:3", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505280", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3505280", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:HMM, author = "La Zhang and Haiyun Guo and Kuan Zhu and Honglin Qiao and Gaopan Huang and Sen Zhang and Huichen Zhang and Jian Sun and Jinqiao Wang", title = "Hybrid Modality Metric Learning for Visible-Infrared Person Re-Identification", journal = j-TOMM, volume = "18", number = "1s", pages = "25:1--25:15", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473341", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473341", abstract = "Visible-infrared person re-identification (Re-ID) has received increasing research attention for its great practical value in night-time surveillance scenarios. 
Due to the large variations in person pose, viewpoint, and occlusion in the same modality, as \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2022:BIB, author = "Sheng Xu and Chang Liu and Baochang Zhang and Jinhu L{\"u} and Guodong Guo and David Doermann", title = "{BiRe-ID}: Binary Neural Network for Efficient Person Re-{ID}", journal = j-TOMM, volume = "18", number = "1s", pages = "26:1--26:22", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473340", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473340", abstract = "Person re-identification (Re-ID) has been promoted by the significant success of convolutional neural networks (CNNs). However, the application of such CNN-based Re-ID methods depends on the tremendous consumption of computation and memory resources, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2022:JGF, author = "Zhongwei Zhao and Ran Song and Qian Zhang and Peng Duan and Youmei Zhang", title = "{JoT-GAN}: a Framework for Jointly Training {GAN} and Person Re-Identification Model", journal = j-TOMM, volume = "18", number = "1s", pages = "27:1--27:18", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491225", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491225", abstract = "To cope with the problem caused by inadequate training data, many person re-identification (re-id) methods exploit generative adversarial networks (GAN) for data augmentation, where the training of GAN is typically independent of that of the re-id model. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2022:SCP, author = "Liqian Liang and Congyan Lang and Zun Li and Jian Zhao and Tao Wang and Songhe Feng", title = "Seeing Crucial Parts: Vehicle Model Verification via a Discriminative Representation Model", journal = j-TOMM, volume = "18", number = "1s", pages = "28:1--28:22", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3474596", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3474596", abstract = "Widely used surveillance cameras have promoted large amounts of street scene data, which contains one important but long-neglected object: the vehicle. Here we focus on the challenging problem of vehicle model verification. Most previous works usually \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2022:AIF, author = "Chenggang Yan and Lixuan Meng and Liang Li and Jiehua Zhang and Zhan Wang and Jian Yin and Jiyong Zhang and Yaoqi Sun and Bolun Zheng", title = "Age-Invariant Face Recognition by Multi-Feature Fusionand Decomposition with Self-attention", journal = j-TOMM, volume = "18", number = "1s", pages = "29:1--29:18", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472810", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472810", abstract = "Different from general face recognition, age-invariant face recognition (AIFR) aims at matching faces with a big age gap. Previous discriminative methods usually focus on decomposing facial feature into age-related and age-invariant components, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhai:2022:RML, author = "Deming Zhai and Ruifeng Shi and Junjun Jiang and Xianming Liu", title = "Rectified Meta-learning from Noisy Labels for Robust Image-based Plant Disease Classification", journal = j-TOMM, volume = "18", number = "1s", pages = "30:1--30:17", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472809", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472809", abstract = "Plant diseases serve as one of main threats to food security and crop production. It is thus valuable to exploit recent advances of artificial intelligence to assist plant disease diagnosis. One popular approach is to transform this problem as a leaf \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tan:2022:FGI, author = "Min Tan and Fu Yuan and Jun Yu and Guijun Wang and Xiaoling Gu", title = "Fine-grained Image Classification via Multi-scale Selective Hierarchical Biquadratic Pooling", journal = j-TOMM, volume = "18", number = "1s", pages = "31:1--31:23", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492221", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3492221", abstract = "How to extract distinctive features greatly challenges the fine-grained image classification tasks. 
In previous models, bilinear pooling has been frequently adopted to address this problem. However, most bilinear pooling models neglect either intra or \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cucchiara:2022:FGH, author = "Rita Cucchiara and Matteo Fabbri", title = "Fine-grained Human Analysis under Occlusions and Perspective Constraints in Multimedia Surveillance", journal = j-TOMM, volume = "18", number = "1s", pages = "32:1--32:23", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3476839", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3476839", abstract = "Human detection in the wild is a research topic of paramount importance in computer vision, and it is the starting step for designing intelligent systems oriented to human interaction that work in complete autonomy. To achieve this goal, computer vision \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:ICG, author = "Lei Wu and Hefei Ling and Yuxuan Shi and Baiyan Zhang", title = "Instance Correlation Graph for Unsupervised Domain Adaptation", journal = j-TOMM, volume = "18", number = "1s", pages = "33:1--33:23", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3486251", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3486251", abstract = "In recent years, deep neural networks have emerged as a dominant machine learning tool for a wide variety of application fields. Due to the expensive cost of manual labeling efforts, it is important to transfer knowledge from a label-rich source domain to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mugnai:2022:FGA, author = "Daniele Mugnai and Federico Pernici and Francesco Turchini and Alberto {Del Bimbo}", title = "Fine-Grained Adversarial Semi-Supervised Learning", journal = j-TOMM, volume = "18", number = "1s", pages = "34:1--34:19", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485473", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485473", abstract = "In this article, we exploit Semi-Supervised Learning (SSL) to increase the amount of training data to improve the performance of Fine-Grained Visual Categorization (FGVC). 
This problem has not been investigated in the past in spite of prohibitive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Luo:2022:ERU, author = "Dezhao Luo and Yu Zhou and Bo Fang and Yucan Zhou and Dayan Wu and Weiping Wang", title = "Exploring Relations in Untrimmed Videos for Self-Supervised Learning", journal = j-TOMM, volume = "18", number = "1s", pages = "35:1--35:21", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473342", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473342", abstract = "Existing video self-supervised learning methods mainly rely on trimmed videos for model training. They apply their methods and verify the effectiveness on trimmed video datasets including UCF101 and Kinetics-400, among others. However, trimmed datasets \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:EEC, author = "Yabin Wang and Zhiheng Ma and Xing Wei and Shuai Zheng and Yaowei Wang and Xiaopeng Hong", title = "{ECCNAS}: Efficient Crowd Counting Neural Architecture Search", journal = j-TOMM, volume = "18", number = "1s", pages = "36:1--36:19", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3465455", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3465455", abstract = "Recent solutions to crowd counting problems have already achieved promising performance across various benchmarks. However, applying these approaches to real-world applications is still challenging, because they are computation intensive and lack the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:CFH, author = "Wenxu Li and Gang Pan and Chen Wang and Zhen Xing and Zhenjun Han", title = "From Coarse to Fine: Hierarchical Structure-aware Video Summarization", journal = j-TOMM, volume = "18", number = "1s", pages = "37:1--37:16", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485472", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485472", abstract = "Hierarchical structure is a common characteristic for some kinds of videos (e.g., sports videos, game videos): The videos are composed of several actions hierarchically and there exist temporal dependencies among segments with different scales, where \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hossain:2022:SSA, author = "M. Shamim Hossain and Rita Cucchiara and Ghulam Muhammad and Diana P. Tob{\'o}n and Abdulmotaleb {El Saddik}", title = "Special Section on {AI-empowered} Multimedia Data Analytics for Smart Healthcare", journal = j-TOMM, volume = "18", number = "1s", pages = "38:1--38:2", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505281", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3505281", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2022:MFT, author = "Min Chen and Wenjing Xiao and Miao Li and Yixue Hao and Long Hu and Guangming Tao", title = "A Multi-feature and Time-aware-based Stress Evaluation Mechanism for Mental Status Adjustment", journal = j-TOMM, volume = "18", number = "1s", pages = "39:1--39:18", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3462763", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3462763", abstract = "With the rapid economic development, the prominent social competition has led to increasing psychological pressure of people felt from each aspect of life. Driven by the Internet of Things and artificial intelligence, intelligent psychological pressure \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Masud:2022:CNN, author = "Mehedi Masud and Mohammed F. 
Alhamid and Yin Zhang", title = "A Convolutional Neural Network Model Using Weighted Loss Function to Detect Diabetic Retinopathy", journal = j-TOMM, volume = "18", number = "1s", pages = "40:1--40:16", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3470976", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3470976", abstract = "Nowadays, artificial intelligence (AI) provides tremendous prospects for driving future healthcare while empowering patients and service providers. The extensive use of digital healthcare produces a massive amount of multimedia healthcare data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:TTM, author = "Debin Liu and Laurence T. Yang and Puming Wang and Ruonan Zhao and Qingchen Zhang", title = "{TT-TSVD}: a Multi-modal Tensor Train Decomposition with Its Application in Convolutional Neural Networks for Smart Healthcare", journal = j-TOMM, volume = "18", number = "1s", pages = "41:1--41:17", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491223", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491223", abstract = "Smart healthcare systems are generating a large scale of heterogeneous high-dimensional data with complex relationships. It is hard for current methods to analyze such high-dimensional healthcare data. Specifically, the traditional data reduction methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2022:MNM, author = "Chun-Wei Yang and Thanh Hai Phung and Hong-Han Shuai and Wen-Huang Cheng", title = "Mask or Non-Mask? {Robust} Face Mask Detector via Triplet-Consistency Representation Learning", journal = j-TOMM, volume = "18", number = "1s", pages = "42:1--42:20", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472623", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472623", abstract = "In the absence of vaccines or medicines to stop COVID-19, one of the effective methods to slow the spread of the coronavirus and reduce the overloading of healthcare is to wear a face mask. Nevertheless, to mandate the use of face masks or coverings in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2022:DLB, author = "Zhihan Lv and Zengchen Yu and Shuxuan Xie and Atif Alamri", title = "Deep Learning-based Smart Predictive Evaluation for Interactive Multimedia-enabled Smart Healthcare", journal = j-TOMM, volume = "18", number = "1s", pages = "43:1--43:20", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3468506", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:52 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3468506", abstract = "Two-dimensional arrays of bi-component structures made of cobalt and permalloy elliptical dots with thickness of 25 nm, length 1 mm and width of 225 nm, have been prepared by a self-aligned shadow deposition technique. Brillouin light scattering has been \ldots{}", remark = "Abstract appears mismatched: it describes magnetic nanostructure physics (cobalt/permalloy dots, Brillouin light scattering), not the deep-learning smart-healthcare topic of the title; verify against the publisher record at https://doi.org/10.1145/3468506.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Amirpour:2022:ELF, author = "Hadi Amirpour and Antonio Pinheiro and Manuela Pereira and Fernando J. P.
Lopes and Mohammad Ghanbari", title = "Efficient Light Field Image Compression with Enhanced Random Access", journal = j-TOMM, volume = "18", number = "2", pages = "44:1--44:18", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3471905", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3471905", abstract = "In light field image compression, facilitating random access to individual views plays a significant role in decoding views quickly, reducing memory footprint, and decreasing the bandwidth requirement for transmission. Highly efficient light field image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Morillo:2022:EIP, author = "Pedro Morillo and Jos{\'e} J. Navarro-P{\'e}rez and Juan M. Ordu{\~n}a and Marcos Fern{\'a}ndez", title = "Evaluation of an Intervention Program Based on Mobile Apps to Learn Sexism Prevention in Teenagers", journal = j-TOMM, volume = "18", number = "2", pages = "45:1--45:20", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3471139", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3471139", abstract = "The fight against sexism is nowadays one of the flagship social movements in western countries. 
Adolescence is a crucial period, and some empirical studies have focused on the socialization of teenagers, proving that the socialization with the surrounding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2022:LTS, author = "Yansong Tang and Xingyu Liu and Xumin Yu and Danyang Zhang and Jiwen Lu and Jie Zhou", title = "Learning from Temporal Spatial Cubism for Cross-Dataset Skeleton-based Action Recognition", journal = j-TOMM, volume = "18", number = "2", pages = "46:1--46:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3472722", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3472722", abstract = "Rapid progress and superior performance have been achieved for skeleton-based action recognition recently. In this article, we investigate this problem under a cross-dataset setting, which is a new, pragmatic, and challenging task in real-world scenarios. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kizilkaya:2022:EFF, author = "Burak Kizilkaya and Enver Ever and Hakan Yekta Yatbaz and Adnan Yazici", title = "An Effective Forest Fire Detection Framework Using Heterogeneous Wireless Multimedia Sensor Networks", journal = j-TOMM, volume = "18", number = "2", pages = "47:1--47:21", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473037", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473037", abstract = "With improvements in the area of Internet of Things (IoT), surveillance systems have recently become more accessible. At the same time, optimizing the energy requirements of smart sensors, especially for data transmission, has always been very important \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:UEU, author = "Yehao Li and Jiahao Fan and Yingwei Pan and Ting Yao and Weiyao Lin and Tao Mei", title = "{Uni-EDEN}: Universal Encoder-Decoder Network by Multi-Granular Vision-Language Pre-training", journal = j-TOMM, volume = "18", number = "2", pages = "48:1--48:16", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3473140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3473140", abstract = "Vision-language pre-training has been an emerging and fast-developing research topic, which transfers multi-modal knowledge from rich-resource pre-training task to limited-resource downstream tasks. Unlike existing works that predominantly learn a single \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2022:CSL, author = "Shenming Feng and Xingzhong Nong and Haifeng Hu", title = "Cascaded Structure-Learning Network with Using Adversarial Training for Robust Facial Landmark Detection", journal = j-TOMM, volume = "18", number = "2", pages = "49:1--49:20", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3474595", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3474595", abstract = "Recently, great progress has been achieved on facial landmark detection based on convolutional neural network, while it is still challenging due to partial occlusion and extreme head pose. In this paper, we propose a Cascaded Structure-Learning Network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Damme:2022:MLB, author = "Sam {Van Damme} and Maria {Torres Vega} and Filip {De Turck}", title = "Machine Learning Based Content-Agnostic Viewport Prediction for 360-Degree Video", journal = j-TOMM, volume = "18", number = "2", pages = "50:1--50:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3474833", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3474833", abstract = "Accurate and fast estimations or predictions of the (near) future location of the users of head-mounted devices within the virtual omnidirectional environment open a plethora of opportunities in application domains such as interactive immersive gaming and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yeh:2022:GVW, author = "Chih-Kuo Yeh and Thi-Ngoc-Hanh Le and Zhi-Ying Hou and Tong-Yee Lee", title = "Generating Virtual Wire Sculptural Art from {$3$D} Models", journal = j-TOMM, volume = "18", number = "2", pages = "51:1--51:23", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3475798", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3475798", abstract = "Wire sculptures are objects sculpted by the use of wires. In this article, we propose practical methods to create 3D virtual wire sculptural art from a given 3D model. 
In contrast, most of the previous 3D wire art results are reconstructed from input 2D \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2022:RGJ, author = "Teng Sun and Chun Wang and Xuemeng Song and Fuli Feng and Liqiang Nie", title = "Response Generation by Jointly Modeling Personalized Linguistic Styles and Emotions", journal = j-TOMM, volume = "18", number = "2", pages = "52:1--52:20", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3475872", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3475872", abstract = "Natural language generation (NLG) has been an essential technique for various applications, like XiaoIce and Siri, and engaged increasing attention recently. To improve the user experience, several emotion-aware NLG methods have been developed to generate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Francis:2022:GRS, author = "Jobin Francis and M. Baburaj and Sudhish N. 
George", title = "An $ l_{1 / 2} $ and Graph Regularized Subspace Clustering Method for Robust Image Segmentation", journal = j-TOMM, volume = "18", number = "2", pages = "53:1--53:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3476514", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3476514", abstract = "Segmenting meaningful visual structures from an image is a fundamental and most-addressed problem in image analysis algorithms. However, among factors such as diverse visual patterns, noise, complex backgrounds, and similar textures present in foreground \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:WYE, author = "Jiahao Wang and Yunhong Wang and Nina Weng and Tianrui Chai and Annan Li and Faxi Zhang and Samsi Yu", title = "Will You Ever Become Popular? {Learning} to Predict Virality of Dance Clips", journal = j-TOMM, volume = "18", number = "2", pages = "54:1--54:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477533", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3477533", abstract = "Dance challenges are going viral in video communities like TikTok nowadays. Once a challenge becomes popular, thousands of short-form videos will be uploaded within a couple of days. Therefore, virality prediction from dance challenges is of great \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhong:2022:DSA, author = "Sheng-Hua Zhong and Jingxu Lin and Jianglin Lu and Ahmed Fares and Tongwei Ren", title = "Deep Semantic and Attentive Network for Unsupervised Video Summarization", journal = j-TOMM, volume = "18", number = "2", pages = "55:1--55:21", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477538", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3477538", abstract = "With the rapid growth of video data, video summarization is a promising approach to shorten a lengthy video into a compact version. Although supervised summarization approaches have achieved state-of-the-art performance, they require frame-level annotated \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2022:MIL, author = "Yawen Zeng and Da Cao and Shaofei Lu and Hanling Zhang and Jiao Xu and Zheng Qin", title = "Moment is Important: Language-Based Video Moment Retrieval via Adversarial Learning", journal = j-TOMM, volume = "18", number = "2", pages = "56:1--56:21", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3478025", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3478025", abstract = "The newly emerging language-based video moment retrieval task aims at retrieving a target video moment from an untrimmed video given a natural language as the query. It is more applicable in reality since it is able to accurately localize a specific video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:LTP, author = "Hanjie Wu and Yongtuo Liu and Hongmin Cai and Shengfeng He", title = "Learning Transferable Perturbations for Image Captioning", journal = j-TOMM, volume = "18", number = "2", pages = "57:1--57:18", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3478024", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3478024", abstract = "Present studies have discovered that state-of-the-art deep learning models can be attacked by small but well-designed perturbations. 
Existing attack algorithms for the image captioning task is time-consuming, and their generated adversarial examples \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2022:SSS, author = "Ziyi Sun and Yunfeng Zhang and Fangxun Bao and Ping Wang and Xunxiang Yao and Caiming Zhang", title = "{SADnet}: Semi-supervised Single Image Dehazing Method Based on an Attention Mechanism", journal = j-TOMM, volume = "18", number = "2", pages = "58:1--58:23", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3478457", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3478457", abstract = "Many real-life tasks such as military reconnaissance and traffic monitoring require high-quality images. However, images acquired in foggy or hazy weather pose obstacles to the implementation of these real-life tasks; consequently, image dehazing is an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:TIS, author = "Feifei Zhang and Mingliang Xu and Changsheng Xu", title = "Tell, Imagine, and Search: End-to-end Learning for Composing Text and Image to Image Retrieval", journal = j-TOMM, volume = "18", number = "2", pages = "59:1--59:23", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3478642", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3478642", abstract = "Composing Text and Image to Image Retrieval ( CTI-IR ) is an emerging task in computer vision, which allows retrieving images relevant to a query image with text describing desired modifications to the query image. Most conventional cross-modal retrieval \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2022:SAM, author = "Haoyu Ma and Bingchen Gong and Yizhou Yu", title = "Structure-aware Meta-fusion for Image Super-resolution", journal = j-TOMM, volume = "18", number = "2", pages = "60:1--60:25", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477553", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3477553", abstract = "There are two main categories of image super-resolution algorithms: distortion oriented and perception oriented. 
Recent evidence shows that reconstruction accuracy and perceptual quality are typically in disagreement with each other. In this article, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tahir:2022:NAT, author = "Madiha Tahir and Zahid Halim and Atta Ur Rahman and Muhammad Waqas and Shanshan Tu and Sheng Chen and Zhu Han", title = "Non-Acted Text and Keystrokes Database and Learning Methods to Recognize Emotions", journal = j-TOMM, volume = "18", number = "2", pages = "61:1--61:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3480968", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3480968", abstract = "The modern computing applications are presently adapting to the convenient availability of huge and diverse data for making their pattern recognition methods smarter. Identification of dominant emotion solely based on the text data generated by humans is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fincato:2022:TWD, author = "Matteo Fincato and Marcella Cornia and Federico Landi and Fabio Cesari and Rita Cucchiara", title = "Transform, Warp, and Dress: a New Transformation-guided Model for Virtual Try-on", journal = j-TOMM, volume = "18", number = "2", pages = "62:1--62:24", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491226", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491226", abstract = "Virtual try-on has recently emerged in computer vision and multimedia communities with the development of architectures that can generate realistic images of a target person wearing a custom garment. This research interest is motivated by the large role \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2022:AMG, author = "Ning Han and Jingjing Chen and Hao Zhang and Huanwen Wang and Hao Chen", title = "Adversarial Multi-Grained Embedding Network for Cross-Modal Text-Video Retrieval", journal = j-TOMM, volume = "18", number = "2", pages = "63:1--63:23", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3483381", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3483381", abstract = "Cross-modal retrieval between texts and videos has received consistent research interest in the multimedia community. 
Existing studies follow a trend of learning a joint embedding space to measure the distance between text and video representations. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pang:2022:FUP, author = "Bo Pang and Deming Zhai and Junjun Jiang and Xianming Liu", title = "Fully Unsupervised Person Re-Identification via Selective Contrastive Learning", journal = j-TOMM, volume = "18", number = "2", pages = "64:1--64:15", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485061", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485061", abstract = "Person re-identification (ReID) aims at searching the same identity person among images captured by various cameras. Existing fully supervised person ReID methods usually suffer from poor generalization capability caused by domain gaps. Unsupervised \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhuang:2022:MAD, author = "Wenlin Zhuang and Congyi Wang and Jinxiang Chai and Yangang Wang and Ming Shao and Siyu Xia", title = "{Music2Dance}: {DanceNet} for Music-Driven Dance Generation", journal = j-TOMM, volume = "18", number = "2", pages = "65:1--65:21", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485664", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485664", abstract = "Synthesize human motions from music (i.e., music to dance) is appealing and has attracted lots of research interests in recent years. It is challenging because of the requirement for realistic and complex human motions for dance, but more importantly, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cetinic:2022:UCA, author = "Eva Cetinic and James She", title = "Understanding and Creating Art with {AI}: Review and Outlook", journal = j-TOMM, volume = "18", number = "2", pages = "66:1--66:22", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3475799", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:54 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3475799", abstract = "Technologies related to artificial intelligence (AI) have a strong impact on the changes of research and creative practices in visual arts. 
The growing number of research initiatives and creative applications that emerge in the intersection of AI and art \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:DVS, author = "Zheng Zhang and Jianning Wang and Lei Zhu and Guangming Lu", title = "Discriminative Visual Similarity Search with Semantically Cycle-consistent Hashing Networks", journal = j-TOMM, volume = "18", number = "2s", pages = "114:1--114:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3532519", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532519", abstract = "Deep hashing has great potential in large-scale visual similarity search due to its preferable efficiency in storage and computation. Technically, deep \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ge:2022:DVD, author = "Shiming Ge and Fanzhao Lin and Chenyu Li and Daichi Zhang and Weiping Wang and Dan Zeng", title = "Deepfake Video Detection via Predictive Representation Learning", journal = j-TOMM, volume = "18", number = "2s", pages = "115:1--115:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3536426", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3536426", abstract = "Increasingly advanced deepfake approaches have made the detection of deepfake videos very challenging. We observe that the general deepfake videos often \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Galteri:2022:LLB, author = "Leonardo Galteri and Lorenzo Seidenari and Pietro Bongini and Marco Bertini and Alberto {Del Bimbo}", title = "{LANBIQUE}: {LANguage-based Blind Image QUality Evaluation}", journal = j-TOMM, volume = "18", number = "2s", pages = "116:1--116:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3538649", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538649", abstract = "Image quality assessment is often performed with deep networks that are fine-tuned to regress a human provided quality score of a given image. Usually, this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "116", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2022:SCC, author = "Zhihan Lv and Dongliang Chen and Haibin Lv", title = "Smart City Construction and Management by Digital Twins and {BIM} Big Data in {COVID-19} Scenario", journal = j-TOMM, volume = "18", number = "2s", pages = "117:1--117:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3529395", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3529395", abstract = "With the rapid development of information technology and the spread of Corona Virus Disease 2019 (COVID-19), the government and urban managers are looking \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Anand:2022:CSD, author = "Ashima Anand and Amit Kumar Singh", title = "A Comprehensive Study of Deep Learning-based Covert Communication", journal = j-TOMM, volume = "18", number = "2s", pages = "118:1--118:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3508365", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3508365", abstract = "Deep learning-based methods have been popular in multimedia analysis tasks, including classification, detection, segmentation, and so on. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2022:EAC, author = "Haotian Xu and Xiaobo Jin and Qiufeng Wang and Amir Hussain and Kaizhu Huang", title = "Exploiting Attention-Consistency Loss For Spatial-Temporal Stream Action Recognition", journal = j-TOMM, volume = "18", number = "2s", pages = "119:1--119:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3538749", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538749", abstract = "Currently, many action recognition methods mostly consider the information from spatial streams. We propose a new perspective inspired by the human visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Salim:2022:PED, author = "Sara Salim and Nour Moustafa and Benjamin Turnbull and Imran Razzak", title = "Perturbation-enabled Deep Federated Learning for Preserving {Internet of Things}-based Social Networks", journal = j-TOMM, volume = "18", number = "2s", pages = "120:1--120:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3537899", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3537899", abstract = "Federated Learning (FL), as an emerging form of distributed machine learning (ML), can protect participants' private data from being substantially disclosed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bi:2022:DTE, author = "An-Qi Bi and Xiao-Yang Tian and Shui-Hua Wang and Yu-Dong Zhang", title = "Dynamic Transfer Exemplar based Facial Emotion Recognition Model Toward Online Video", journal = j-TOMM, volume = "18", number = "2s", pages = "121:1--121:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3538385", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538385", abstract = "In this article, we focus on the dynamic facial emotion recognition from online video. 
We combine deep neural networks with transfer learning theory and propose a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Golmaryami:2022:SSS, author = "Marjan Golmaryami and Rahim Taheri and Zahra Pooranian and Mohammad Shojafar and Pei Xiao", title = "{SETTI}: a {Self-supervised AdvErsarial Malware DeTection ArchiTecture in an IoT} Environment", journal = j-TOMM, volume = "18", number = "2s", pages = "122:1--122:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3536425", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3536425", abstract = "In recent years, malware detection has become an active research topic in the area of Internet of Things (IoT) security. The principle is to exploit knowledge \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Khan:2022:PPM, author = "Abbas Khan and Ijaz {Ul Haq} and Tanveer Hussain and Khan Muhammad and Mohammad Hijji and Muhammad Sajjad and Victor Hugo C. 
{De Albuquerque} and Sung Wook Baik", title = "{PMAL}: a Proxy Model Active Learning Approach for Vision Based Industrial Applications", journal = j-TOMM, volume = "18", number = "2s", pages = "123:1--123:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3534932", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3534932", abstract = "Deep Learning models' performance strongly correlate with availability of annotated data; however, massive data labeling is laborious, expensive, and error-prone when \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2022:DQN, author = "Chenyi Yang and Xiaolong Xu and Xiaokang Zhou and Lianyong Qi", title = "{Deep Q} Network-Driven Task Offloading for Efficient Multimedia Data Analysis in Edge Computing-Assisted {IoV}", journal = j-TOMM, volume = "18", number = "2s", pages = "124:1--124:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3548687", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3548687", abstract = "With the prosperity of Industry 4.0, numerous emerging industries continue to gain popularity and their market scales are expanding ceaselessly. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tiwari:2022:ODN, author = "Arti Tiwari and Millie Pant", title = "Optimized Deep-Neural Network for Content-based Medical Image Retrieval in a Brownfield {IoMT} Network", journal = j-TOMM, volume = "18", number = "2s", pages = "125:1--125:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546194", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3546194", abstract = "In this paper, a brownfield Internet of Medical Things network is introduced for imaging data that can be easily scaled out depending on the objectives, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2022:SFM, author = "Wei Huang and Yuze Zhang and Shaohua Wan", title = "A Sorting Fuzzy Min-Max Model in an Embedded System for Atrial Fibrillation Detection", journal = j-TOMM, volume = "18", number = "2s", pages = "126:1--126:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3554737", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3554737", abstract = "Atrial fibrillation detection (AFD) has attracted much attention in the field of embedded systems. In this study, we propose a sorting fuzzy min-max \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2022:ISS, author = "Xun Yang and Liang Zheng and Elisa Ricci and Meng Wang", title = "Introduction to the Special Section on Learning Representations, Similarity, and Associations in Dynamic Multimedia Environments", journal = j-TOMM, volume = "18", number = "2s", pages = "127e:1--127e:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3569952", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3569952", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "127e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2022:RLD, author = "Jun He and Richang Hong and Xueliang Liu and Mingliang Xu and Qianru Sun", title = "Revisiting Local Descriptor for Improved Few-Shot Classification", journal = j-TOMM, volume = "18", number = "2s", pages = "127:1--127:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3511917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3511917", abstract = "Few-shot classification studies the problem of quickly adapting a deep learner to understanding novel classes based on few support images. In this context, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiao:2022:GGL, author = "Yingying Jiao and Haipeng Chen and Runyang Feng and Haoming Chen and Sifan Wu and Yifang Yin and Zhenguang Liu", title = "{GLPose}: Global-Local Representation Learning for Human Pose Estimation", journal = j-TOMM, volume = "18", number = "2s", pages = "128:1--128:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3519305", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3519305", abstract = "Multi-frame human pose estimation is at the core of many computer vision tasks. Although state-of-the-art approaches have demonstrated remarkable results \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2022:STS, author = "Qing Han and Huiting Liu and Weidong Min and Tiemei Huang and Deyu Lin and Qi Wang", title = "{$3$D} Skeleton and Two Streams Approach to Person Re-identification Using Optimized Region Matching", journal = j-TOMM, volume = "18", number = "2s", pages = "129:1--129:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3538490", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538490", abstract = "Person re-identification (Re-ID) is a challenging and arduous task due to non-overlapping views, complex background, and uncontrollable occlusion in video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2022:RRL, author = "Xin Xu and Xin Yuan and Zheng Wang and Kai Zhang and Ruimin Hu", title = "Rank-in-Rank Loss for Person Re-identification", journal = j-TOMM, volume = "18", number = "2s", pages = "130:1--130:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3532866", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532866", abstract = "Person re-identification (re-ID) is commonly investigated as a ranking problem. However, the performance of existing re-ID models drops dramatically, when they \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:GGA, author = "Kunpeng Li and Chang Liu and Mike Stopa and Jun Amano and Yun Fu", title = "Guided Graph Attention Learning for Video-Text Matching", journal = j-TOMM, volume = "18", number = "2s", pages = "131:1--131:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3538533", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538533", abstract = "As a bridge between videos and natural languages, video-text matching has been a hot multimedia research topic in recent years. Such cross-modal retrieval \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Biondi:2022:CRC, author = "Niccol{\`o} Biondi and Federico Pernici and Matteo Bruni and Daniele Mugnai and Alberto {Del Bimbo}", title = "{CL$^2$R}: Compatible Lifelong Learning Representations", journal = j-TOMM, volume = "18", number = "2s", pages = "132:1--132:??", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3564786", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3564786", abstract = "In this article, we propose a method to partially mimic natural intelligence for the problem of lifelong learning representations that are compatible. We take the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2022:CIK, author = "Yonghua Pan and Zechao Li and Liyan Zhang and Jinhui Tang", title = "Causal Inference with Knowledge Distilling and Curriculum Learning for Unbiased {VQA}", journal = j-TOMM, volume = "18", number = "3", pages = "67:1--67:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3487042", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3487042", abstract = "Recently, many Visual Question Answering (VQA) models rely on the correlations between questions and answers yet neglect those between the visual information and the textual information. They would perform badly if the handled data distribute differently \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yanagi:2022:IRR, author = "Rintaro Yanagi and Ren Togo and Takahiro Ogawa and Miki Haseyama", title = "Interactive Re-ranking via Object Entropy-Guided Question Answering for Cross-Modal Image Retrieval", journal = j-TOMM, volume = "18", number = "3", pages = "68:1--68:17", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485042", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485042", abstract = "Cross-modal image-retrieval methods retrieve desired images from a query text by learning relationships between texts and images. 
Such a retrieval approach is one of the most effective ways of achieving the easiness of query preparation. Recent cross-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2022:SIN, author = "Qinghongya Shi and Hong-Bo Zhang and Zhe Li and Ji-Xiang Du and Qing Lei and Jing-Hua Liu", title = "Shuffle-invariant Network for Action Recognition in Videos", journal = j-TOMM, volume = "18", number = "3", pages = "69:1--69:18", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485665", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3485665", abstract = "The local key features in video are important for improving the accuracy of human action recognition. However, most end-to-end methods focus on global feature learning from videos, while few works consider the enhancement of the local information in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2022:LAS, author = "Di Yuan and Xiaojun Chang and Zhihui Li and Zhenyu He", title = "Learning Adaptive Spatial-Temporal Context-Aware Correlation Filters for {UAV} Tracking", journal = j-TOMM, volume = "18", number = "3", pages = "70:1--70:18", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3486678", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3486678", abstract = "Tracking in the unmanned aerial vehicle (UAV) scenarios is one of the main components of target-tracking tasks. Different from the target-tracking task in the general scenarios, the target-tracking task in the UAV scenarios is very challenging because of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2022:ESR, author = "Guofei Sun and Yongkang Wong and Mohan S. 
Kankanhalli and Xiangdong Li and Weidong Geng", title = "Enhanced {$3$D} Shape Reconstruction With Knowledge Graph of Category Concept", journal = j-TOMM, volume = "18", number = "3", pages = "71:1--71:20", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491224", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491224", abstract = "Reconstructing three-dimensional (3D) objects from images has attracted increasing attention due to its wide applications in computer vision and robotic tasks. Despite the promising progress of recent deep learning-based approaches, which directly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:DIG, author = "Jinfeng Li and Weifeng Liu and Yicong Zhou and Jun Yu and Dapeng Tao and Changsheng Xu", title = "Domain-invariant Graph for Adaptive Semi-supervised Domain Adaptation", journal = j-TOMM, volume = "18", number = "3", pages = "72:1--72:18", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3487194", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3487194", abstract = "Domain adaptation aims to generalize a model from a source domain to tackle tasks in a related but different target domain. Traditional domain adaptation algorithms assume that enough labeled data, which are treated as the prior knowledge are available in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2022:OOS, author = "Ran Shi and Jing Ma and King Ngi Ngan and Jian Xiong and Tong Qiao", title = "Objective Object Segmentation Visual Quality Evaluation: Quality Measure and Pooling Method", journal = j-TOMM, volume = "18", number = "3", pages = "73:1--73:19", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491229", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491229", abstract = "Objective object segmentation visual quality evaluation is an emergent member of the visual quality assessment family. It aims to develop an objective measure instead of a subjective survey to evaluate the object segmentation quality in agreement with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2022:CAS, author = "Linghua Zeng and Xinmei Tian", title = "{CRAR}: Accelerating Stereo Matching with Cascaded Residual Regression and Adaptive Refinement", journal = j-TOMM, volume = "18", number = "3", pages = "74:1--74:19", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3488719", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3488719", abstract = "Dense stereo matching estimates the depth for each pixel of the referenced images. 
Recently, deep learning algorithms have dramatically promoted the development of stereo matching. The state-of-the-art result is achieved by models adopting deep \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yao:2022:RGA, author = "Lingxiang Yao and Worapan Kusakunniran and Qiang Wu and Jingsong Xu and Jian Zhang", title = "Recognizing Gaits Across Walking and Running Speeds", journal = j-TOMM, volume = "18", number = "3", pages = "75:1--75:22", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3488715", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3488715", abstract = "For decades, very few methods were proposed for cross-mode (i.e., walking vs. running) gait recognition. Thus, it remains largely unexplored regarding how to recognize persons by the way they walk and run. Existing cross-mode methods handle the walking-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:IKB, author = "Qun Li and Fu Xiao and Bir Bhanu and Biyun Sheng and Richang Hong", title = "Inner Knowledge-based {Img2Doc} Scheme for Visual Question Answering", journal = j-TOMM, volume = "18", number = "3", pages = "76:1--76:21", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3489142", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3489142", abstract = "Visual Question Answering (VQA) is a research topic of significant interest at the intersection of computer vision and natural language understanding. Recent research indicates that attributes and knowledge can effectively improve performance for both \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cornia:2022:MFA, author = "Marcella Cornia and Matteo Tomei and Lorenzo Baraldi and Rita Cucchiara", title = "Matching Faces and Attributes Between the Artistic and the Real Domain: the {PersonArt} Approach", journal = j-TOMM, volume = "18", number = "3", pages = "77:1--77:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3490033", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3490033", abstract = "In this article, we present an approach for retrieving similar faces between the artistic and the real domain. 
The application we refer to is an interactive exhibition inside a museum, in which a visitor can take a photo of himself and search for a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yin:2022:MFL, author = "Guanghao Yin and Shouqian Sun and Dian Yu and Dejian Li and Kejun Zhang", title = "A Multimodal Framework for Large-Scale Emotion Recognition by Fusing Music and Electrodermal Activity Signals", journal = j-TOMM, volume = "18", number = "3", pages = "78:1--78:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3490686", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3490686", abstract = "Considerable attention has been paid to physiological signal-based emotion recognition in the field of affective computing. For reliability and user-friendly acquisition, electrodermal activity (EDA) has a great advantage in practical applications. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Buckchash:2022:GLG, author = "Himanshu Buckchash and Balasubramanian Raman", title = "{GraSP}: Local {Grassmannian} Spatio-Temporal Patterns for Unsupervised Pose Sequence Recognition", journal = j-TOMM, volume = "18", number = "3", pages = "79:1--79:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491227", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491227", abstract = "Many applications of action recognition, especially broad domains like surveillance or anomaly-detection, favor unsupervised methods considering that exhaustive labeling of actions is not possible. However, very limited work has happened in this domain. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2022:SSR, author = "Xiaoguang Zhu and Ye Zhu and Haoyu Wang and Honglin Wen and Yan Yan and Peilin Liu", title = "Skeleton Sequence and {RGB} Frame Based Multi-Modality Feature Fusion Network for Action Recognition", journal = j-TOMM, volume = "18", number = "3", pages = "80:1--80:24", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491228", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491228", abstract = "Action recognition has been a heated topic in computer vision for its wide application in vision systems. 
Previous approaches achieve improvement by fusing the modalities of the skeleton sequence and RGB video. However, such methods pose a dilemma between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chowdhury:2022:DGS, author = "Debanjan Roy Chowdhury and Sukumar Nandi and Diganta Goswami", title = "Distributed Gateway Selection for Video Streaming in {VANET} Using {IP} Multicast", journal = j-TOMM, volume = "18", number = "3", pages = "81:1--81:24", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491388", abstract = "The volume of video traffic as infotainment service over vehicular ad hoc network (VANET) has rapidly increased for past few years. Providing video streaming as VANET infotainment service is very challenging because of high mobility and heterogeneity of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alaya:2022:MVE, author = "Bechir Alaya and Lamaa Sellami", title = "Multilayer Video Encoding for {QoS} Managing of Video Streaming in {VANET} Environment", journal = j-TOMM, volume = "18", number = "3", pages = "82:1--82:19", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491433", abstract = "Efficient delivery and maintenance of the quality of service (QoS) of audio/video streams transmitted over VANETs for mobile and heterogeneous nodes are one of the major challenges in the convergence of this network type and these services. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:WPM, author = "Yike Wu and Shiwan Zhao and Ying Zhang and Xiaojie Yuan and Zhong Su", title = "When Pairs Meet Triplets: Improving Low-Resource Captioning via Multi-Objective Optimization", journal = j-TOMM, volume = "18", number = "3", pages = "83:1--83:20", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492325", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3492325", abstract = "Image captioning for low-resource languages has attracted much attention recently. 
Researchers propose to augment the low-resource caption dataset into (image, rich-resource language, and low-resource language) triplets and develop the dual attention \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2022:ICD, author = "Kai-Wei Yang and Yen-Yun Huang and Jen-Wei Huang and Ya-Rou Hsu and Chang-Lin Wan and Hong-Han Shuai and Li-Chun Wang and Wen-Huang Cheng", title = "Improving Crowd Density Estimation by Fusing Aerial Images and Radio Signals", journal = j-TOMM, volume = "18", number = "3", pages = "84:1--84:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492346", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3492346", abstract = "A recent line of research focuses on crowd density estimation from RGB images for a variety of applications, for example, surveillance and traffic flow control. The performance drops dramatically for low-quality images, such as occlusion, or poor light \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xia:2022:FCS, author = "Zhihua Xia and Qiuju Ji and Qi Gu and Chengsheng Yuan and Fengjun Xiao", title = "A Format-compatible Searchable Encryption Scheme for {JPEG} Images Using Bag-of-words", journal = j-TOMM, volume = "18", number = "3", pages = "85:1--85:18", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492705", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3492705", abstract = "The development of cloud computing attracts enterprises and individuals to outsource their data, such as images, to the cloud server. However, direct outsourcing causes the extensive concern of privacy leakage, as images often contain rich sensitive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Natgunanathan:2022:BBA, author = "Iynkaran Natgunanathan and Purathani Praitheeshan and Longxiang Gao and Yong Xiang and Lei Pan", title = "Blockchain-Based Audio Watermarking Technique for Multimedia Copyright Protection in Distribution Networks", journal = j-TOMM, volume = "18", number = "3", pages = "86:1--86:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492803", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3492803", abstract = "Copyright protection in multimedia protection distribution is a challenging problem. To protect multimedia data, many watermarking methods have been proposed in the literature. However, most of them cannot be used effectively in a multimedia distribution \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2022:DIE, author = "Kehua Guo and Min Hu and Sheng Ren and Fangfang Li and Jian Zhang and Haifu Guo and Xiaoyan Kui", title = "Deep Illumination-Enhanced Face Super-Resolution Network for Low-Light Images", journal = j-TOMM, volume = "18", number = "3", pages = "87:1--87:19", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3495258", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3495258", abstract = "Face images are typically a key component in the fields of security and criminal investigation. However, due to lighting and shooting angles, faces taken under low-light conditions are often difficult to recognize. Face super-resolution (FSR) technology \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:SSM, author = "Xiaoming Liu and Shuo Wang and Ying Zhang and Quan Yuan", title = "Scribble-Supervised Meibomian Glands Segmentation in Infrared Images", journal = j-TOMM, volume = "18", number = "3", pages = "88:1--88:23", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3497747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3497747", abstract = "Infrared imaging is currently the most effective clinical method to evaluate the morphology of the meibomian glands (MGs) in patients. 
As an important indicator for monitoring the development of MG dysfunction, it is necessary to accurately measure gland-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2022:TII, author = "Kedar Nath Singh and Amit Kumar Singh", title = "Towards Integrating Image Encryption with Compression: a Survey", journal = j-TOMM, volume = "18", number = "3", pages = "89:1--89:21", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3498342", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:55 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3498342", abstract = "As digital images are consistently generated and transmitted online, the unauthorized utilization of these images is an increasing concern that has a significant impact on both security and privacy issues; additionally, the representation of digital \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{MontenegroMarin:2022:ISI, author = "Carlos Enrique {Montenegro Marin} and Dinesh Jackson Samuel and Nallappan Gunasekaran", title = "Introduction to the Special Issue on {6G} Enabled Interactive Multimedia Communication Systems", journal = j-TOMM, volume = "18", number = "3s", pages = "133:1--133:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3567835", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3567835", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "133e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:CAP, author = "Ran Li and Wei Wei and Peinan Hao and Jian Su and Fengyuan Sun", title = "Context-aware Pseudo-true Video Interpolation at {6G} Edge", journal = j-TOMM, volume = "18", number = "3s", pages = "133:1--133:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3555313", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3555313", abstract = "In the 6G network, lots of edge devices facilitate the low-latency transmission of video. However, with limited processing and storage capabilities, the edge \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alharbi:2022:NSA, author = "Abdullah Alharbi and Mohammed Aljebreen and Amr Tolba and Konstantinos A. Lizos and Saied Abd El-Atty and Farid Shawki", title = "A Normalized Slicing-assigned Virtualization Method for 6G-based Wireless Communication Systems", journal = j-TOMM, volume = "18", number = "3s", pages = "134:1--134:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546077", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3546077", abstract = "The next generation of wireless communication systems will rely on advantageous sixth-generation wireless network (6G) features and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:ISIb, author = "Yin Zhang and Iztok Humar and Jia Liu and Alireza Jolfaei", title = "Introduction to the Special Issue on Affective Services based on Representation Learning", journal = j-TOMM, volume = "18", number = "3s", pages = "135:1--135:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3567836", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3567836", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "135e", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2022:DBJ, author = "Kexin Xu and Haijun Zhang and Keping Long and Jianquan Wang and Lei Sun", title = "{DRL} based Joint Affective Services Computing and Resource Allocation in {ISTN}", journal = j-TOMM, volume = "18", number = "3s", pages = "135:1--135:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3561821", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561821", abstract = "Affective services will become a research hotspot in artificial intelligence (AI) in the next decade. In this paper, a novel service paradigm combined with wireless \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "135", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:AIA, author = "Yazhou Zhang and Prayag Tiwari and Lu Rong and Rui Chen and Nojoom A. Alnajem and M. 
Shamim Hossain", title = "Affective Interaction: Attentive Representation Learning for Multi-Modal Sentiment Classification", journal = j-TOMM, volume = "18", number = "3s", pages = "136:1--136:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3527175", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3527175", abstract = "The recent booming of artificial intelligence (AI) applications, e.g., affective robots, human-machine interfaces, autonomous vehicles, and so on, has produced \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "136", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:BRJ, author = "Xiaoqin Wang and Chen Chen and Rushi Lan and Licheng Liu and Zhenbing Liu and Huiyu Zhou and Xiaonan Luo", title = "Binary Representation via Jointly Personalized Sparse Hashing", journal = j-TOMM, volume = "18", number = "3s", pages = "137:1--137:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3558769", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3558769", abstract = "Unsupervised hashing has attracted much attention for binary representation learning due to the requirement of economical storage and efficiency of binary \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "137", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jin:2022:AAA, author = "Xin Jin and Xinning Li and Hao Lou and Chenyu Fan and Qiang Deng and Chaoen Xiao and Shuai Cui and Amit Kumar Singh", title = "Aesthetic Attribute Assessment of Images Numerically on Mixed Multi-attribute Datasets", journal = j-TOMM, volume = "18", number = "3s", pages = "138:1--138:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3547144", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3547144", abstract = "With the continuous development of social software and multimedia technology, images have become a kind of important carrier for spreading information and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "138", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2022:SBH, author = "Jie Cao and Youquan Wang and Haicheng Tao and Xiang Guo", title = "Sensor-based Human Activity Recognition Using Graph {LSTM} and Multi-task Classification Model", journal = j-TOMM, volume = "18", number = "3s", pages = "139:1--139:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3561387", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561387", abstract = "This paper explores human activities recognition from sensor-based multi-dimensional streams. 
Recently, deep learning-based methods such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "139", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2022:OTV, author = "Jiawei Huang and Qichen Su and Weihe Li and Zhuoran Liu and Tao Zhang and Sen Liu and Ping Zhong and Wanchun Jiang and Jianxin Wang", title = "Opportunistic Transmission for Video Streaming over Wild {Internet}", journal = j-TOMM, volume = "18", number = "3s", pages = "140:1--140:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3488722", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3488722", abstract = "The video streaming system employs adaptive bitrate (ABR) algorithms to optimize a user's quality of experience. However, it is hard for ABR \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "140", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duanmu:2022:BQE, author = "Zhengfang Duanmu and Wentao Liu and Diqi Chen and Zhuoran Li and Zhou Wang and Yizhou Wang and Wen Gao", title = "A {Bayesian} Quality-of-Experience Model for Adaptive Streaming Videos", journal = j-TOMM, volume = "18", number = "3s", pages = "141:1--141:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3491432", abstract = "The fundamental conflict between the enormous space of adaptive streaming videos and the limited capacity for subjective experiment casts significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "141", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ignat:2022:WDI, author = "Oana Ignat and Santiago Castro and Yuhang Zhou and Jiajun Bao and Dandan Shan and Rada Mihalcea", title = "When Did It Happen? {Duration}-informed Temporal Localization of Narrated Actions in Vlogs", journal = j-TOMM, volume = "18", number = "3s", pages = "142:1--142:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3495211", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3495211", abstract = "We consider the task of temporal human action localization in lifestyle vlogs. 
We introduce a novel dataset consisting of manual annotations of temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "142", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2022:HMU, author = "Wuzhen Shi and Shaohui Liu", title = "Hiding Message Using a Cycle Generative Adversarial Network", journal = j-TOMM, volume = "18", number = "3s", pages = "143:1--143:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3495566", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3495566", abstract = "Training an image steganography is an unsupervised problem, because it is impossible to obtain an ideal supervised steganographic image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "143", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hui:2022:STC, author = "Chen Hui and Shaohui Liu and Wuzhen Shi and Feng Jiang and Debin Zhao", title = "Spatio-Temporal Context Based Adaptive Camcorder Recording Watermarking", journal = j-TOMM, volume = "18", number = "3s", pages = "144:1--144:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3503160", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3503160", abstract = "Video watermarking technology has attracted increasing attention in the past few years, and a great deal of traditional and deep learning-based methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "144", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2022:BAP, author = "Jian Zhao and Xianhui Liu and Weidong Zhao", title = "Balanced and Accurate Pseudo-Labels for Semi-Supervised Image Classification", journal = j-TOMM, volume = "18", number = "3s", pages = "145:1--145:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506711", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506711", abstract = "Image classification by semi-supervised learning has recently become a hot spot, and the Co-Training framework is an important method of semi-supervised image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "145", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Stacchio:2022:THA, author = "Lorenzo Stacchio and Alessia Angeli and Giuseppe Lisanti and Daniela Calanca and Gustavo Marfia", title = "Toward a Holistic Approach to the Socio-historical Analysis of Vernacular Photos", journal = j-TOMM, volume = "18", number = "3s", pages = "146:1--146:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3507918", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3507918", abstract = "Although one of the most popular practices in photography since the end of the 19th century, an increase in scholarly interest in family photo albums dates back to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "146", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2022:DAS, author = "Hui-Chu Xiao and Wan-Lei Zhao and Jie Lin and Yi-Geng Hong and Chong-Wah Ngo", title = "Deeply Activated Salient Region for Instance Search", journal = j-TOMM, volume = "18", number = "3s", pages = "147:1--147:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3510004", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3510004", abstract = "The performance of instance search relies heavily on the ability to locate and describe a wide variety of object instances in a video/image collection. 
Due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "147", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:CEC, author = "Zuquan Liu and Guopu Zhu and Feng Ding and Xiangyang Luo and Sam Kwong and Peng Li", title = "Contrast-Enhanced Color Visual Cryptography for $ (k, n) $ Threshold Schemes", journal = j-TOMM, volume = "18", number = "3s", pages = "148:1--148:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3508394", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3508394", abstract = "In traditional visual cryptography schemes (VCSs), pixel expansion remains to be an unsolved challenge. To alleviate the impact of pixel expansion, several \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "148", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:DSS, author = "Zhe Liu and Xian-Hua Han", title = "Deep Self-Supervised Hyperspectral Image Reconstruction", journal = j-TOMM, volume = "18", number = "3s", pages = "149:1--149:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3510373", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3510373", abstract = "Reconstructing a high-resolution hyperspectral (HR-HS) image via merging a low-resolution hyperspectral (LR-HS) image and a high-resolution RGB \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "149", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2022:SSD, author = "Gurinder Singh and Puneet Goyal", title = "{SDCN2}: a Shallow Densely Connected {CNN} for Multi-Purpose Image Manipulation Detection", journal = j-TOMM, volume = "18", number = "3s", pages = "150:1--150:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3510462", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3510462", abstract = "Digital image information can be easily tampered with to harm the integrity of someone. Thus, recognizing the truthfulness and processing history of an image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "150", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:SGS, author = "Yunfei Liu and Yu Li and Shaodi You and Feng Lu", title = "Semantic Guided Single Image Reflection Removal", journal = j-TOMM, volume = "18", number = "3s", pages = "151:1--151:??", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3510821", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:31 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3510821", abstract = "Reflection is common when we see through a glass window, which not only is a visual disturbance but also influences the performance of computer vision \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "151", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:IFD, author = "Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun Chen and Yimin Liu", title = "Improving Feature Discrimination for Object Tracking by Structural-similarity-based Metric Learning", journal = j-TOMM, volume = "18", number = "4", pages = "90:1--90:23", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3497746", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3497746", abstract = "Existing approaches usually form the tracking task as an appearance matching procedure. 
However, the discrimination ability of appearance features is insufficient in these trackers, which is caused by their weak feature supervision constraints and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2022:IBP, author = "Xiaowen Huang and Jitao Sang and Changsheng Xu", title = "Image-Based Personality Questionnaire Design", journal = j-TOMM, volume = "18", number = "4", pages = "91:1--91:??", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3503489", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:32 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3503489", abstract = "This article explores the problem of image-based personality questionnaire design. Compared with the traditional text-based personality questionnaire, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hao:2022:DLL, author = "Shijie Hao and Xu Han and Yanrong Guo and Meng Wang", title = "Decoupled Low-Light Image Enhancement", journal = j-TOMM, volume = "18", number = "4", pages = "92:1--92:19", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3498341", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3498341", abstract = "The visual quality of photographs taken under imperfect lightness conditions can be degenerated by multiple factors, e.g., low lightness, imaging noise, color distortion, and so on. Current low-light image enhancement models focus on the improvement of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:AQR, author = "Yibing Liu and Yangyang Guo and Jianhua Yin and Xuemeng Song and Weifeng Liu and Liqiang Nie and Min Zhang", title = "Answer Questions with Right Image Regions: a Visual Attention Regularization Approach", journal = j-TOMM, volume = "18", number = "4", pages = "93:1--93:18", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3498340", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3498340", abstract = "Visual attention in Visual Question Answering (VQA) targets at locating the right image regions regarding the answer prediction, offering a powerful technique to promote multi-modal understanding. However, recent studies have pointed out that the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2022:DAM, author = "Yang Yu and Rongrong Ni and Wenjie Li and Yao Zhao", title = "Detection of {AI-Manipulated} Fake Faces via Mining Generalized Features", journal = j-TOMM, volume = "18", number = "4", pages = "94:1--94:23", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3499026", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3499026", abstract = "Recently, AI-manipulated face techniques have developed rapidly and constantly, which has raised new security issues in society. 
Although existing detection methods consider different categories of fake faces, the performance on detecting the fake faces \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cheng:2022:CMG, author = "Yuhao Cheng and Xiaoguang Zhu and Jiuchao Qian and Fei Wen and Peilin Liu", title = "Cross-modal Graph Matching Network for Image-text Retrieval", journal = j-TOMM, volume = "18", number = "4", pages = "95:1--95:23", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3499027", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3499027", abstract = "Image-text retrieval is a fundamental cross-modal task whose main idea is to learn image-text matching. Generally, according to whether there exist interactions during the retrieval process, existing image-text retrieval methods can be classified into \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dogariu:2022:GRS, author = "Mihai Dogariu and Liviu-Daniel {\c{S}}tefan and Bogdan Andrei Boteanu and Claudiu Lamba and Bomi Kim and Bogdan Ionescu", title = "Generation of Realistic Synthetic Financial Time-series", journal = j-TOMM, volume = "18", number = "4", pages = "96:1--96:27", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501305", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3501305", abstract = "Financial markets have always been a point of interest for automated systems. Due to their complex nature, financial algorithms and fintech frameworks require vast amounts of data to accurately respond to market fluctuations. This data availability is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2022:CMS, author = "Yi Zheng and Yong Zhou and Jiaqi Zhao and Ying Chen and Rui Yao and Bing Liu and Abdulmotaleb {El Saddik}", title = "Clustering Matters: Sphere Feature for Fully Unsupervised Person Re-identification", journal = j-TOMM, volume = "18", number = "4", pages = "97:1--97:18", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501404", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3501404", abstract = "In person re-identification (Re-ID), the data annotation cost of supervised learning, is huge and it cannot adapt well to complex situations. Therefore, compared with supervised deep learning methods, unsupervised methods are more in line with actual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2022:HMB, author = "Zengming Tang and Jun Huang", title = "Harmonious Multi-branch Network for Person Re-identification with Harder Triplet Loss", journal = j-TOMM, volume = "18", number = "4", pages = "98:1--98:21", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501405", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3501405", abstract = "Recently, advances in person re-identification (Re-ID) has benefitted from use of the popular multi-branch network. 
However, performing feature learning in a single branch with uniform partitioning is likely to separate meaningful local regions, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2022:TCA, author = "Yifan Xu and Kekai Sheng and Weiming Dong and Baoyuan Wu and Changsheng Xu and Bao-Gang Hu", title = "Towards Corruption-Agnostic Robust Domain Adaptation", journal = j-TOMM, volume = "18", number = "4", pages = "99:1--99:16", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501800", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3501800", abstract = "Great progress has been achieved in domain adaptation in decades. Existing works are always based on an ideal assumption that testing target domains are independent and identically distributed with training target domains. However, due to unpredictable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2022:JSC, author = "Jinzhi Lin and Yun Zhang and Na Li and Hongling Jiang", title = "Joint Source-Channel Decoding of Polar Codes for {HEVC}-Based Video Streaming", journal = j-TOMM, volume = "18", number = "4", pages = "100:1--100:23", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3502208", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3502208", abstract = "Ultra High-Definition (UHD) and Virtual Reality (VR) video streaming over 5G networks are emerging, in which High-Efficiency Video Coding (HEVC) is used as source coding to compress videos more efficiently and polar code is used as channel coding to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2022:DES, author = "Yongrui Li and Zengfu Wang and Jun Yu", title = "Densely Enhanced Semantic Network for Conversation System in Social Media", journal = j-TOMM, volume = "18", number = "4", pages = "101:1--101:24", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501799", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3501799", abstract = "The human-computer conversation system is a significant application in the field of multimedia. 
To select an appropriate response, retrieval-based systems model the matching between the dialogue history and response candidates. However, most of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2022:NCN, author = "Kai Lin and Chuanmin Jia and Xinfeng Zhang and Shanshe Wang and Siwei Ma and Wen Gao", title = "{NR-CNN}: Nested-Residual Guided {CNN} In-loop Filtering for Video Coding", journal = j-TOMM, volume = "18", number = "4", pages = "102:1--102:22", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3502723", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3502723", abstract = "Recently, deep learning for video coding, such as deep predictive coding, deep transform coding, and deep in-loop filtering, has been an emerging research area. The coding gain of hybrid coding framework could be extensively promoted by the data-driven \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dai:2022:FFS, author = "Hanbin Dai and Hailin Shi and Wu Liu and Linfang Wang and Yinglu Liu and Tao Mei", title = "{FasterPose}: a Faster Simple Baseline for Human Pose Estimation", journal = j-TOMM, volume = "18", number = "4", pages = "103:1--103:16", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3503464", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3503464", abstract = "The performance of human pose estimation depends on the spatial accuracy of keypoint localization. Most existing methods pursue the spatial accuracy through learning the high-resolution (HR) representation from input images. By the experimental analysis, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Man:2022:SAR, author = "Xin Man and Deqiang Ouyang and Xiangpeng Li and Jingkuan Song and Jie Shao", title = "Scenario-Aware Recurrent Transformer for Goal-Directed Video Captioning", journal = j-TOMM, volume = "18", number = "4", pages = "104:1--104:17", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3503927", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3503927", abstract = "Fully mining visual cues to aid in content understanding is crucial for video captioning. 
However, most state-of-the-art video captioning methods are limited to generating captions purely based on straightforward information while ignoring the scenario \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2022:OCC, author = "Tianjun Zhang and Hao Deng and Lin Zhang and Shengjie Zhao and Xiao Liu and Yicong Zhou", title = "Online Correction of Camera Poses for the Surround-view System: a Sparse Direct Approach", journal = j-TOMM, volume = "18", number = "4", pages = "106:1--106:24", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505252", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3505252", abstract = "The surround-view module is an indispensable component of a modern advanced driving assistance system. By calibrating the intrinsics and extrinsics of the surround-view cameras accurately, a top-down surround-view can be generated from raw fisheye images. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2022:MGB, author = "Quan Wang and Sheng Li and Xinpeng Zhang and Guorui Feng", title = "Multi-granularity Brushstrokes Network for Universal Style Transfer", journal = j-TOMM, volume = "18", number = "4", pages = "107:1--107:17", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506710", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506710", abstract = "Neural style transfer has been developed in recent years, where both performance and efficiency have been greatly improved. However, most existing methods do not transfer the brushstrokes information of style images well. In this article, we address this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Saxena:2022:PSU, author = "Nidhi Saxena and Balasubramanian Raman", title = "Pansharpening Scheme Using Bi-dimensional Empirical Mode Decomposition and Neural Network", journal = j-TOMM, volume = "18", number = "4", pages = "108:1--108:22", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506709", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506709", abstract = "The pansharpening is a combination of multispectral (MS) and panchromatic (PAN) images that produce a high-spatial-spectral-resolution MS images. 
In multiresolution analysis-based pansharpening schemes, some spatial and spectral distortions are found. It \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2022:EEH, author = "Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun Chen and Jingjing Zhang", title = "An End-to-end Heterogeneous Restraint Network for {RGB-D} Cross-modal Person Re-identification", journal = j-TOMM, volume = "18", number = "4", pages = "109:1--109:22", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506708", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506708", abstract = "The RGB-D cross-modal person re-identification (re-id) task aims to identify the person of interest across the RGB and depth image modes. The tremendous discrepancy between these two modalities makes this task difficult to tackle. Few researchers pay \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2022:SRP, author = "Caixia Liu and Dehui Kong and Shaofan Wang and Jinghua Li and Baocai Yin", title = "A Spatial Relationship Preserving Adversarial Network for {$3$D} Reconstruction from a Single Depth View", journal = j-TOMM, volume = "18", number = "4", pages = "110:1--110:22", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506733", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506733", abstract = "Recovering the geometry of an object from a single depth image is an interesting yet challenging problem. While previous learning based approaches have demonstrated promising performance, they don't fully explore spatial relationships of objects, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ren:2022:EES, author = "Ruyong Ren and Shaozhang Niu and Hua Ren and Shubin Zhang and Tengyue Han and Xiaohai Tong", title = "{ESRNet}: Efficient Search and Recognition Network for Image Manipulation Detection", journal = j-TOMM, volume = "18", number = "4", pages = "111:1--111:23", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506853", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506853", abstract = "With the widespread use of smartphones and the rise of intelligent software, we can manipulate captured photos anytime and anywhere, so the fake photos finally obtained look ``Real.'' If these intelligent operation methods are maliciously applied to our \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duan:2022:NMS, author = "Mingxing Duan and Kenli Li and Jiayan Deng and Bin Xiao and Qi Tian", title = "A Novel Multi-Sample Generation Method for Adversarial Attacks", journal = j-TOMM, volume = "18", number = "4", pages = "112:1--112:21", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506852", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3506852", abstract = "Deep learning models are widely used in daily life, which bring great convenience to our lives, but they are vulnerable to attacks. 
How to build an attack system with strong generalization ability to test the robustness of deep learning systems is a hot \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2022:ATA, author = "Yang Guo and Wei Gao and Siwei Ma and Ge Li", title = "Accelerating Transform Algorithm Implementation for Efficient Intra Coding of {8K UHD} Videos", journal = j-TOMM, volume = "18", number = "4", pages = "113:1--113:20", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3507970", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Mar 24 08:21:57 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3507970", abstract = "Real-time ultra-high-definition (UHD) video applications have attracted much attention, where the encoder side urgently demands the high-throughput two-dimensional (2D) transform hardware implementation for the latest video coding standards. This article \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shao:2023:SIP, author = "Xuan Shao and Ying Shen and Lin Zhang and Shengjie Zhao and Dandan Zhu and Yicong Zhou", title = "{SLAM} for Indoor Parking: a Comprehensive Benchmark Dataset and a Tightly Coupled Semantic Framework", journal = j-TOMM, volume = "19", number = "1", pages = "1:1--1:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3510856", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3510856", abstract = "For the task of autonomous indoor parking, various Visual-Inertial Simultaneous Localization And Mapping (SLAM) systems are expected to achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sharma:2023:WBA, author = "Prasen Sharma and Ira Bisht and Arijit Sur", title = "Wavelength-based Attributed Deep Neural Network for Underwater Image Restoration", journal = j-TOMM, volume = "19", number = "1", pages = "2:1--2:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3511021", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3511021", abstract = "Background: Underwater images, in general, suffer from low contrast and high color distortions due to the non-uniform attenuation of the light as it \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:SCE, author = "Jie Li and Ling Han and Chong Zhang and Qiyue Li and Zhi Liu", title = "Spherical Convolution Empowered Viewport Prediction in 360 Video Multicast with Limited {FoV} Feedback", journal = j-TOMM, volume = "19", number = "1", pages = "3:1--3:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3511603", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3511603", abstract = "Field of view (FoV) prediction is critical in 360-degree video multicast, which is a key component of the emerging virtual reality and augmented reality \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Le:2023:ASN, author = "Thi-Ngoc-Hanh Le and Chih-Kuo Yeh and Ying-Chi Lin and Tong-Yee Lee", title = "Animating Still Natural Images Using Warping", journal = j-TOMM, volume = "19", number = "1", pages = "4:1--4:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3511894", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3511894", abstract = "From a single still image, a looping video could be generated by imparting subtle motion to objects in the image. The results are a hybrid of photography and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiong:2023:RRD, author = "Lizhi Xiong and Xiao Han and Ching-Nung Yang and Zhihua Xia", title = "{RDH-DES}: Reversible Data Hiding over Distributed Encrypted-Image Servers Based on Secret Sharing", journal = j-TOMM, volume = "19", number = "1", pages = "5:1--5:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3512797", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3512797", abstract = "Reversible Data Hiding in Encrypted Image (RDHEI) schemes may redistribute the data hiding procedure to other parties and can preserve privacy of the cover \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhen:2023:TAO, author = "Peining Zhen and Shuqi Wang and Suming Zhang and Xiaotao Yan and Wei Wang and Zhigang Ji and Hai-Bao Chen", title = "Towards Accurate Oriented Object Detection in Aerial Images with Adaptive Multi-level Feature Fusion", journal = j-TOMM, volume = "19", number = "1", pages = "6:1--6:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3513133", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3513133", abstract = "Detecting objects in aerial images is a long-standing and challenging problem since the objects in aerial images vary dramatically in size and orientation. Most existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2023:DSD, author = "Yue Song and Hao Tang and Nicu Sebe and Wei Wang", title = "Disentangle Saliency Detection into Cascaded Detail Modeling and Body Filling", journal = j-TOMM, volume = "19", number = "1", pages = "7:1--7:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3513134", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3513134", abstract = "Salient object detection has been long studied to identify the most visually attractive objects in images/videos. 
Recently, a growing amount of approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:BSG, author = "Yong Zhang and Yingwei Pan and Ting Yao and Rui Huang and Tao Mei and Chang-Wen Chen", title = "Boosting Scene Graph Generation with Visual Relation Saliency", journal = j-TOMM, volume = "19", number = "1", pages = "8:1--8:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3514041", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3514041", abstract = "The scene graph is a symbolic data structure that comprehensively describes the objects and visual relations in a visual scene, while ignoring the inherent perceptual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:BVL, author = "Jingwen Chen and Jianjie Luo and Yingwei Pan and Yehao Li and Ting Yao and Hongyang Chao and Tao Mei", title = "Boosting Vision-and-Language Navigation with Direction Guiding and Backtracing", journal = j-TOMM, volume = "19", number = "1", pages = "9:1--9:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3526024", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3526024", abstract = "Vision-and-Language Navigation (VLN) has been an emerging and fast-developing research topic, where an embodied agent is required to navigate in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rao:2023:DPZ, author = "Yunbo Rao and Ziqiang Yang and Shaoning Zeng and Qifeng Wang and Jiansu Pu", title = "Dual Projective Zero-Shot Learning Using Text Descriptions", journal = j-TOMM, volume = "19", number = "1", pages = "10:1--10:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3514247", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3514247", abstract = "Zero-shot learning (ZSL) aims to recognize image instances of unseen classes solely based on the semantic descriptions of the unseen classes. In this field, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2023:MVS, author = "Hang Yu and Chilam Cheang and Yanwei Fu and Xiangyang Xue", title = "Multi-view Shape Generation for a {$3$D} Human-like Body", journal = j-TOMM, volume = "19", number = "1", pages = "11:1--11:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3514248", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3514248", abstract = "Three-dimensional (3D) human-like body reconstruction via a single RGB image has attracted significant research attention recently. Most of the existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:WST, author = "Weidong Chen and Guorong Li and Xinfeng Zhang and Shuhui Wang and Liang Li and Qingming Huang", title = "Weakly Supervised Text-based Actor-Action Video Segmentation by Clip-level Multi-instance Learning", journal = j-TOMM, volume = "19", number = "1", pages = "12:1--12:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3514250", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3514250", abstract = "In real-world scenarios, it is common that a video contains multiple actors and their activities. Selectively localizing one specific actor and its action spatially and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shen:2023:QFC, author = "Feihong Shen and Jun Liu", title = "Quantum {Fourier} Convolutional Network", journal = j-TOMM, volume = "19", number = "1", pages = "13:1--13:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3514249", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3514249", abstract = "The neural network and quantum computing are both significant and appealing fields, with their interactive disciplines promising for large-scale computing tasks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:BBT, author = "Xiaotian Wu and Peng Yao", title = "{Boolean}-based Two-in-One Secret Image Sharing by Adaptive Pixel Grouping", journal = j-TOMM, volume = "19", number = "1", pages = "14:1--14:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3517140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3517140", abstract = "The two-in-one secret image sharing (TiOSIS) technique is a hybrid scheme that protects a secret image by combining visual cryptography (VCS) and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yadav:2023:DML, author = "Ashima Yadav and Dinesh Kumar Vishwakarma", title = "A Deep Multi-level Attentive Network for Multimodal Sentiment Analysis", journal = j-TOMM, volume = "19", number = "1", pages = "15:1--15:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3517139", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3517139", abstract = "Multimodal sentiment analysis has attracted increasing attention with broad application prospects. Most of the existing methods have focused on a single modality, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2023:NGA, author = "Honghao Gao and Baobin Dai and Huaikou Miao and Xiaoxian Yang and Ram{\'o}n J. Dur{\'a}n Barroso and Walayat Hussain", title = "A Novel {GAPG} Approach to Automatic Property Generation for Formal Verification: The {GAN} Perspective", journal = j-TOMM, volume = "19", number = "1", pages = "16:1--16:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3517154", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3517154", abstract = "Formal methods have been widely used to support software testing to guarantee correctness and reliability. For example, model checking technology \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans.
Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:LSS, author = "Pengyi Zhang and Huanzhang Dou and Wenhu Zhang and Yuhan Zhao and Zequn Qin and Dongping Hu and Yi Fang and Xi Li", title = "A Large-Scale Synthetic Gait Dataset Towards in-the-Wild Simulation and Comparison Study", journal = j-TOMM, volume = "19", number = "1", pages = "17:1--17:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3517199", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3517199", abstract = "Gait recognition has a rapid development in recent years. However, current gait recognition focuses primarily on ideal laboratory scenes, leaving the gait \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2023:DAB, author = "Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and Haifeng Hu", title = "Double Attention Based on Graph Attention Network for Image Multi-Label Classification", journal = j-TOMM, volume = "19", number = "1", pages = "18:1--18:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3519030", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3519030", abstract = "The task of image multi-label classification is to accurately recognize multiple objects in an input image. 
Most of the recent works need to leverage the label \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:ANM, author = "Xianlin Zhang and Mengling Shen and Xueming Li and Xiaojie Wang", title = "{AABLSTM}: a Novel Multi-task Based {CNN-RNN} Deep Model for Fashion Analysis", journal = j-TOMM, volume = "19", number = "1", pages = "19:1--19:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3519029", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3519029", abstract = "With the rapid growth of online commerce and fashion-related applications, visual clothing analysis and recognition has become a hotspot in computer \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:GML, author = "Deyin Liu and Lin (Yuanbo) Wu and Richang Hong and Zongyuan Ge and Jialie Shen and Farid Boussaid and Mohammed Bennamoun", title = "Generative Metric Learning for Adversarially Robust Open-world Person Re-Identification", journal = j-TOMM, volume = "19", number = "1", pages = "20:1--20:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3522714", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3522714", abstract = "The vulnerability of re-identification (re-ID) models under adversarial attacks is of significant concern as criminals may use adversarial perturbations to evade \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:BHI, author = "Shuo Wang and Huixia Ben and Yanbin Hao and Xiangnan He and Meng Wang", title = "Boosting Hyperspectral Image Classification with Dual Hierarchical Learning", journal = j-TOMM, volume = "19", number = "1", pages = "21:1--21:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3522713", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3522713", abstract = "Hyperspectral image (HSI) classification aims at predicting the pixel-wise labels in an image, where there are only a few labeled pixel samples (hard labels) for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:DUD, author = "Dayan Wu and Qi Dai and Bo Li and Weiping Wang", title = "Deep Uncoupled Discrete Hashing via Similarity Matrix Decomposition", journal = j-TOMM, volume = "19", number = "1", pages = "22:1--22:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524021", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524021", abstract = "Hashing has been drawing increasing attention in the task of large-scale image retrieval owing to its storage and computation efficiency, especially \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cheung:2023:SNA, author = "Ming Cheung and Weiwei Sun and James She and Jiantao Zhou", title = "Social Network Analytic-Based Online Counterfeit Seller Detection using User Shared Images", journal = j-TOMM, volume = "19", number = "1", pages = "23:1--23:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524135", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524135", abstract = "Selling counterfeit online has become a serious problem, especially with the advancement of social media and mobile technology. Instead of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feihong:2023:THQ, author = "Lu Feihong and Chen Hang and Li Kang and Deng Qiliang and Zhao Jian and Zhang Kaipeng and Han Hong", title = "Toward High-quality Face-Mask Occluded Restoration", journal = j-TOMM, volume = "19", number = "1", pages = "24:1--24:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524137", abstract = "Face-mask occluded restoration aims at restoring the masked region of a human face, which has attracted increasing attention in the context of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:CSL, author = "Yajing Liu and Zhiwei Xiong and Ya Li and Yuning Lu and Xinmei Tian and Zheng-Jun Zha", title = "Category-Stitch Learning for Union Domain Generalization", journal = j-TOMM, volume = "19", number = "1", pages = "25:1--25:??", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524136", abstract = "Domain generalization aims at generalizing the network trained on multiple domains to unknown but related domains. Under the assumption that different domains \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ferrari:2023:CRR, author = "Claudio Ferrari and Federico Becattini and Leonardo Galteri and Alberto {Del Bimbo}", title = "{(Compress and Restore)$^N$}: a Robust Defense Against Adversarial Attacks on Image Classification", journal = j-TOMM, volume = "19", number = "1s", pages = "26:1--26:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524619", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524619", abstract = "Modern image classification approaches often rely on deep neural networks, which have shown pronounced weakness to adversarial examples: images \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans.
Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2023:SSC, author = "Yaguang Song and Xiaoshan Yang and Changsheng Xu", title = "Self-supervised Calorie-aware Heterogeneous Graph Networks for Food Recommendation", journal = j-TOMM, volume = "19", number = "1s", pages = "27:1--27:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524618", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524618", abstract = "With the rapid development of online recipe sharing platforms, food recommendation is emerging as an important application. Although \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xue:2023:LEE, author = "Feng Xue and Tian Yang and Kang Liu and Zikun Hong and Mingwei Cao and Dan Guo and Richang Hong", title = "{LCSNet}: End-to-end Lipreading with Channel-aware Feature Selection", journal = j-TOMM, volume = "19", number = "1s", pages = "28:1--28:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524620", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524620", abstract = "Lipreading is a task of decoding the movement of the speaker's lip region into text. In recent years, lipreading methods based on deep neural network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2023:LPA, author = "Zilong Fu and Hongtao Xie and Shancheng Fang and Yuxin Wang and Mengting Xing and Yongdong Zhang", title = "Learning Pixel Affinity Pyramid for Arbitrary-Shaped Text Detection", journal = j-TOMM, volume = "19", number = "1s", pages = "29:1--29:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3524617", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524617", abstract = "Arbitrary-shaped text detection in natural images is a challenging task due to the complexity of the background and the diversity of text properties. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{CardiaNeto:2023:LSA, author = "Jo{\~a}o Baptista {Cardia Neto} and Claudio Ferrari and Aparecido {Nilceu Marana} and Stefano Berretti and Alberto {Del Bimbo}", title = "Learning Streamed Attention Network from Descriptor Images for Cross-Resolution {$3$D} Face Recognition", journal = j-TOMM, volume = "19", number = "1s", pages = "30:1--30:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3527158", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3527158", abstract = "In this article, we propose a hybrid framework for cross-resolution 3D face recognition which utilizes a Streamed Attention Network (SAN) that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:TMM, author = "Xin Huang", title = "On Teaching Mode of {MTI} Translation Workshop Based on {IPT} Corpus for {Tibetan} Areas of {China}", journal = j-TOMM, volume = "19", number = "1s", pages = "31:1--31:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3527173", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3527173", abstract = "With the technological turn of applied research in translation, increasing attention has been paid to the teaching of translation technology. 
This article addresses \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:MMM, author = "Liming Xu and Xianhua Zeng and Weisheng Li and Bochuan Zheng", title = "{MFGAN}: Multi-modal Feature-fusion for {CT} Metal Artifact Reduction Using {GANs}", journal = j-TOMM, volume = "19", number = "1s", pages = "32:1--32:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3528172", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3528172", abstract = "Due to the existence of metallic implants in certain patients, the Computed Tomography (CT) images from these patients are often corrupted by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2023:DIP, author = "Yuzhang Hu and Wenhan Yang and Jiaying Liu and Zongming Guo", title = "Deep Inter Prediction with Error-Corrected Auto-Regressive Network for Video Coding", journal = j-TOMM, volume = "19", number = "1s", pages = "33:1--33:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3528173", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3528173", abstract = "Modern codecs remove temporal redundancy of a video via inter prediction, i.e., searching previously coded frames for similar blocks and storing motion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:IIT, author = "Yue Li and Li Zhang and Kai Zhang", title = "{iDAM}: Iteratively Trained Deep In-loop Filter with Adaptive Model Selection", journal = j-TOMM, volume = "19", number = "1s", pages = "34:1--34:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3529107", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3529107", abstract = "As a rapid development of neural-network-based machine learning algorithms, deep learning methods are being tentatively used in a much wider range than \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jaiswal:2023:CNN, author = "Rahul Kumar Jaiswal and Rajesh Kumar Dubey", title = "{CAQoE}: a Novel No-Reference Context-aware Speech Quality Prediction Metric", journal = j-TOMM, volume = "19", number = "1s", pages = "35:1--35:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3529394", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3529394", abstract = "The quality of speech degrades while communicating over Voice over Internet Protocol applications, for example, Google Meet, Microsoft Skype, and Apple \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2023:BPP, author = "Tao Xiang and Honghong Zeng and Biwen Chen and Shangwei Guo", title = "{BMIF}: Privacy-preserving Blockchain-based Medical Image Fusion", journal = j-TOMM, volume = "19", number = "1s", pages = "36:1--36:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531016", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3531016", abstract = "Medical image fusion generates a fused image containing multiple features extracted from different source images, and it is of great help in clinical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:DDB, author = "Xiaoke Zhu and Changlong Li and Xiaopan Chen and Xinyu Zhang and Xiao-Yuan Jing", title = "Distance and Direction Based Deep Discriminant Metric Learning for Kinship Verification", journal = j-TOMM, volume = "19", number = "1s", pages = "37:1--37:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531014", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3531014", abstract = "Image-based kinship verification is an important task in computer vision and has many applications in practice, such as missing children search and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhuang:2023:OPF, author = "Weiming Zhuang and Xin Gan and Yonggang Wen and Shuai Zhang", title = "Optimizing Performance of Federated Person Re-identification: Benchmarking and Analysis", journal = j-TOMM, volume = "19", number = "1s", pages = "38:1--38:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531013", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3531013", abstract = "Increasingly stringent data privacy regulations limit the development of person re-identification (ReID) because person ReID training requires \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{DeDivitiis:2023:DFF, author = "Lavinia {De Divitiis} and Federico Becattini and Claudio Baecchi and Alberto {Del Bimbo}", title = "Disentangling Features for Fashion Recommendation", journal = j-TOMM, volume = "19", number = "1s", pages = "39:1--39:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531017", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3531017", abstract = "Online stores have become fundamental for the fashion industry, revolving around recommendation systems to suggest appropriate items to customers. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chan:2023:UFH, author = "Ka-Hou Chan and Sio-Kei Im", title = "Using Four Hypothesis Probability Estimators for {CABAC} in Versatile Video Coding", journal = j-TOMM, volume = "19", number = "1s", pages = "40:1--40:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3531015", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3531015", abstract = "This article introduces the key technologies involved in four hypothetical probability estimators for Context-based Adaptive Binary Arithmetic Coding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2023:ATD, author = "Mengqi Yuan and Bing-Kun Bao and Zhiyi Tan and Changsheng Xu", title = "Adaptive Text Denoising Network for Image Caption Editing", journal = j-TOMM, volume = "19", number = "1s", pages = "41:1--41:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532627", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532627", abstract = "Image caption editing, which aims at editing the inaccurate descriptions of the images, is an interdisciplinary task of computer vision and natural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:IQA, author = "Xiaoyu Zhang and Wei Gao and Ge Li and Qiuping Jiang and Runmin Cong", title = "Image Quality Assessment-driven Reinforcement Learning for Mixed Distorted Image Restoration", journal = j-TOMM, volume = "19", number = "1s", pages = "42:1--42:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532625", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532625", abstract = "Due to the diversity of the degradation process that is difficult to model, the recovery of mixed distorted images is still a challenging problem. The deep \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bai:2023:DDI, author = "Chongyang Bai and Maksim Bolonkin and Viney Regunath and V. S. Subrahmanian", title = "{DIPS}: a Dyadic Impression Prediction System for Group Interaction Videos", journal = j-TOMM, volume = "19", number = "1s", pages = "43:1--43:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532865", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532865", abstract = "We consider the problem of predicting the impression that one subject has of another in a video clip showing a group of interacting people. Our novel \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:SHL, author = "Yuqing Liu and Xinfeng Zhang and Shanshe Wang and Siwei Ma and Wen Gao", title = "Sequential Hierarchical Learning with Distribution Transformation for Image Super-Resolution", journal = j-TOMM, volume = "19", number = "1s", pages = "44:1--44:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532864", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532864", abstract = "Multi-scale design has been considered in recent image super-resolution (SR) works to explore the hierarchical feature information. Existing multi-scale \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:JJD, author = "Haidong Wang and Xuan He and Zhiyong Li and Jin Yuan and Shutao Li", title = "{JDAN}: Joint Detection and Association Network for Real-Time Online Multi-Object Tracking", journal = j-TOMM, volume = "19", number = "1s", pages = "45:1--45:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3533253", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3533253", abstract = "In the last few years, enormous strides have been made for object detection and data association, which are vital subtasks for one-stage online multi-object \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2023:NRD, author = "Mengyao Xiao and Xiaolong Li and Yao Zhao and Bin Ma and Guodong Guo", title = "A Novel Reversible Data Hiding Scheme Based on Pixel-Residual Histogram", journal = j-TOMM, volume = "19", number = "1s", pages = "46:1--46:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3534565", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3534565", abstract = "Prediction-error expansion (PEE) is the most popular reversible data hiding (RDH) technique due to its efficient capacity-distortion tradeoff. With the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:MGF, author = "Jiazhi Liu and Feng Liu", title = "Modified {$2$D}-Ghost-Free Stereoscopic Display with Depth-of-Field Effects", journal = j-TOMM, volume = "19", number = "1s", pages = "47:1--47:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3534964", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3534964", abstract = "Backward-compatible stereoscopic display, a novel display technique that can simultaneously present satisfying 3D effects to viewers with stereo \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:RAC, author = "Jingwen Chen and Yingwei Pan and Yehao Li and Ting Yao and Hongyang Chao and Tao Mei", title = "Retrieval Augmented Convolutional Encoder-decoder Networks for Video Captioning", journal = j-TOMM, volume = "19", number = "1s", pages = "48:1--48:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3539225", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3539225", abstract = "Video captioning has been an emerging research topic in computer vision, which aims to generate a natural sentence to correctly reflect the visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:CSA, author = "Guanyu Zhu and Yong Zhou and Rui Yao and Hancheng Zhu and Jiaqi Zhao", title = "Cyclic Self-attention for Point Cloud Recognition", journal = j-TOMM, volume = "19", number = "1s", pages = "49:1--49:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3538648", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3538648", abstract = "Point clouds provide a flexible geometric representation for computer vision research. However, the harsh demands for the number of input points and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:EMF, author = "Dinghao Yang and Wei Gao and Ge Li and Hui Yuan and Junhui Hou and Sam Kwong", title = "Exploiting Manifold Feature Representation for Efficient Classification of {$3$D} Point Clouds", journal = j-TOMM, volume = "19", number = "1s", pages = "50:1--50:??", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3539611", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:33 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3539611", abstract = "In this paper, we propose an efficient point cloud classification method via manifold learning based feature representation. Different from conventional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lan:2023:STS, author = "Xiaohan Lan and Yitian Yuan and Xin Wang and Zhi Wang and Wenwu Zhu", title = "A Survey on Temporal Sentence Grounding in Videos", journal = j-TOMM, volume = "19", number = "2", pages = "51:1--51:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532626", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3532626", abstract = "Temporal sentence grounding in videos (TSGV), which aims at localizing one target segment from an untrimmed video with respect to a given sentence \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qiao:2023:HPI, author = "Yu Qiao and Yuhao Liu and Ziqi Wei and Yuxin Wang and Qiang Cai and Guofeng Zhang and Xin Yang", title = "Hierarchical and Progressive Image Matting", journal = j-TOMM, volume = "19", number = "2", pages = "52:1--52:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3540201", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3540201", abstract = "Most matting research resorts to advanced semantics to achieve high-quality alpha mattes, and a direct low-level features combination is usually explored to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2023:LDS, author = "Fei Peng and Wenyan Jiang and Min Long", title = "A Low Distortion and Steganalysis-resistant Reversible Data Hiding for {$2$D} Engineering Graphics", journal = j-TOMM, volume = "19", number = "2", pages = "53:1--53:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3539661", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3539661", abstract = "To reduce the distortion resulting from the large number of crossing quantization cells and resist steganalysis, a reversible data hiding scheme for 2D engineering \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mai:2023:MGU, author = "Sijie Mai and Songlong Xing and Jiaxuan He and Ying Zeng and Haifeng Hu", title = "Multimodal Graph for Unaligned Multimodal Sequence Analysis via Graph Convolution and Graph Pooling", journal = j-TOMM, volume = "19", number = "2", pages = "54:1--54:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3542927", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3542927", abstract = "Multimodal sequence analysis aims to draw inferences from visual, language, and acoustic sequences. A majority of existing works focus on the aligned fusion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2023:PLN, author = "Qi Zheng and Jianfeng Dong and Xiaoye Qu and Xun Yang and Yabing Wang and Pan Zhou and Baolong Liu and Xun Wang", title = "Progressive Localization Networks for Language-Based Moment Localization", journal = j-TOMM, volume = "19", number = "2", pages = "55:1--55:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3543857", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3543857", abstract = "This article targets the task of language-based video moment localization. The language-based setting of this task allows for an open set of target activities, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:LCE, author = "Yue Zhang and Fanghui Zhang and Yi Jin and Yigang Cen and Viacheslav Voronin and Shaohua Wan", title = "Local Correlation Ensemble with {GCN} Based on Attention Features for Cross-domain Person Re-{ID}", journal = j-TOMM, volume = "19", number = "2", pages = "56:1--56:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3542820", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3542820", abstract = "Person re-identification (Re-ID) has achieved great success in single-domain. 
However, it remains a challenging task to adapt a Re-ID model trained on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chakareski:2023:MWF, author = "Jacob Chakareski and Mahmudur Khan and Tanguy Ropitault and Steve Blandino", title = "Millimeter Wave and Free-space-optics for Future Dual-connectivity {6DOF} Mobile Multi-user {VR} Streaming", journal = j-TOMM, volume = "19", number = "2", pages = "57:1--57:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3544494", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3544494", abstract = "Dual-connectivity streaming is a key enabler of next-generation six Degrees Of Freedom (6DOF) Virtual Reality (VR) scene immersion. Indeed, using conventional sub-6 \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2023:IPG, author = "Yun-Shao Lin and Yi-Ching Liu and Chi-Chun Lee", title = "An Interaction-process-guided Framework for Small-group Performance Prediction", journal = j-TOMM, volume = "19", number = "2", pages = "58:1--58:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3558768", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3558768", abstract = "A small group is a fundamental interaction unit for achieving a shared goal. Group performance can be automatically predicted using computational methods to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2023:EEA, author = "Na Zheng and Xuemeng Song and Tianyu Su and Weifeng Liu and Yan Yan and Liqiang Nie", title = "Egocentric Early Action Prediction via Adversarial Knowledge Distillation", journal = j-TOMM, volume = "19", number = "2", pages = "59:1--59:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3544493", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3544493", abstract = "Egocentric early action prediction aims to recognize actions from the first-person view by only observing a partial video segment, which is challenging due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:ISR, author = "Li Wang and Ke Li and Jingjing Tang and Yuying Liang", title = "Image Super-Resolution via Lightweight Attention-Directed Feature Aggregation Network", journal = j-TOMM, volume = "19", number = "2", pages = "60:1--60:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546076", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3546076", abstract = "The advent of convolutional neural networks (CNNs) has brought substantial progress in image super-resolution (SR) reconstruction. However, most SR methods pursue \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2023:FAC, author = "Jiaying Lin and Xin Tan and Ke Xu and Lizhuang Ma and Rynson W. H. Lau", title = "Frequency-aware Camouflaged Object Detection", journal = j-TOMM, volume = "19", number = "2", pages = "61:1--61:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3545609", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3545609", abstract = "Camouflaged object detection (COD) is important as it has various potential applications. Unlike salient object detection (SOD), which tries to identify \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2023:HNR, author = "Shuang Liang and Anjie Zhu and Jiasheng Zhang and Jie Shao", title = "Hyper-node Relational Graph Attention Network for Multi-modal Knowledge Graph Completion", journal = j-TOMM, volume = "19", number = "2", pages = "62:1--62:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3545573", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3545573", abstract = "Knowledge graphs often suffer from incompleteness, and knowledge graph completion (KGC) aims at inferring the missing triplets through \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2023:LVT, author = "Yaya Shi and Haiyang Xu and Chunfeng Yuan and Bing Li and Weiming Hu and Zheng-Jun Zha", title = "Learning Video-Text Aligned Representations for Video Captioning", journal = j-TOMM, volume = "19", number = "2", pages = "63:1--63:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546828", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3546828", abstract = "Video captioning requires that the model has the abilities of video understanding, video-text alignment, and text generation. Due to the semantic gap between vision \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:NRQ, author = "Yang Yang and Yingqiu Ding and Ming Cheng and Weiming Zhang", title = "No-reference Quality Assessment for Contrast-distorted Images Based on Gray and Color-gray-difference Space", journal = j-TOMM, volume = "19", number = "2", pages = "64:1--64:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3555355", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3555355", abstract = "No-reference image quality assessment is a basic and challenging problem in the field of image processing. Among them, contrast distortion has a great impact on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:REC, author = "Jia Wang and Jingcheng Ke and Hong-Han Shuai and Yung-Hui Li and Wen-Huang Cheng", title = "Referring Expression Comprehension Via Enhanced Cross-modal Graph Attention Networks", journal = j-TOMM, volume = "19", number = "2", pages = "65:1--65:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3548688", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3548688", abstract = "Referring expression comprehension aims to localize a specific object in an image according to a given language description. 
It is still challenging to comprehend \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:BLL, author = "Dengyong Zhang and Pu Huang and Xiangling Ding and Feng Li and Wenjie Zhu and Yun Song and Gaobo Yang", title = "{L$^2$BEC$^2$}: Local Lightweight Bidirectional Encoding and Channel Attention Cascade for Video Frame Interpolation", journal = j-TOMM, volume = "19", number = "2", pages = "66:1--66:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3547660", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3547660", abstract = "Video frame interpolation (VFI) is of great importance for many video applications, yet it is still challenging even in the era of deep learning. Some existing VFI models \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:PBI, author = "Yushu Zhang and Qing Tan and Shuren Qi and Mingfu Xue", title = "{PRNU}-based Image Forgery Localization with Deep Multi-scale Fusion", journal = j-TOMM, volume = "19", number = "2", pages = "67:1--67:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3548689", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3548689", abstract = "Photo-response non-uniformity (PRNU), as a class of device fingerprint, plays a key role in the forgery detection/localization for visual media. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dong:2023:SEG, author = "Shanshan Dong and Tianzi Niu and Xin Luo and Wu Liu and Xinshun Xu", title = "Semantic Embedding Guided Attention with Explicit Visual Feature Fusion for Video Captioning", journal = j-TOMM, volume = "19", number = "2", pages = "68:1--68:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550276", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550276", abstract = "Video captioning, which bridges vision and language, is a fundamental yet challenging task in computer vision. To generate accurate and comprehensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:SBS, author = "Shunxin Xu and Ke Sun and Dong Liu and Zhiwei Xiong and Zheng-Jun Zha", title = "Synergy between Semantic Segmentation and Image Denoising via Alternate Boosting", journal = j-TOMM, volume = "19", number = "2", pages = "69:1--69:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3548459", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3548459", abstract = "The capability of image semantic segmentation may be deteriorated due to the noisy input image, where image denoising prior to segmentation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2023:SSI, author = "Dan Song and Chu-Meng Zhang and Xiao-Qian Zhao and Teng Wang and Wei-Zhi Nie and Xuan-Ya Li and An-An Liu", title = "Self-supervised Image-based {$3$D} Model Retrieval", journal = j-TOMM, volume = "19", number = "2", pages = "70:1--70:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3548690", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3548690", abstract = "Image-based 3D model retrieval aims at organizing unlabeled 3D models according to the relevance to the labeled 2D images. With easy accessibility of 2D images and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nousias:2023:DSM, author = "Stavros Nousias and Gerasimos Arvanitis and Aris Lalos and Konstantinos Moustakas", title = "Deep Saliency Mapping for {$3$D} Meshes and Applications", journal = j-TOMM, volume = "19", number = "2", pages = "71:1--71:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550073", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550073", abstract = "Nowadays, three-dimensional (3D) meshes are widely used in various applications in different areas (e.g., industry, education, entertainment and safety). The 3D \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:TNR, author = "Yun Liu and Xiaohua Yin and Zuliang Wan and Guanghui Yue and Zhi Zheng", title = "Toward A No-reference Omnidirectional Image Quality Evaluation by Using Multi-perceptual Features", journal = j-TOMM, volume = "19", number = "2", pages = "72:1--72:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3549544", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3549544", abstract = "Compared to ordinary images, omnidirectional image (OI) usually has a broader view and a higher resolution, and image quality assessment (IQA) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:RIE, author = "Hua Wu and Xin Li and Gang Wang and Guang Cheng and Xiaoyan Hu", title = "Resolution Identification of Encrypted Video Streaming Based on {HTTP/2} Features", journal = j-TOMM, volume = "19", number = "2", pages = "73:1--73:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3551891", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3551891", abstract = "With the inevitable dominance of video traffic on the Internet, Internet service providers (ISP) are striving to deliver video streaming with high quality. Video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qin:2023:QEC, author = "Qipu Qin and Cheolkon Jung", title = "Quality Enhancement of Compressed $ 360 $-Degree Videos Using Viewport-based Deep Neural Networks", journal = j-TOMM, volume = "19", number = "2", pages = "74:1--74:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3551641", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3551641", abstract = "360-degree video provides omnidirectional views by a bounding sphere, thus also called omnidirectional video.
For omnidirectional video, people can only see specific \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2023:AIS, author = "Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and Haifeng Hu", title = "Aligning Image Semantics and Label Concepts for Image Multi-Label Classification", journal = j-TOMM, volume = "19", number = "2", pages = "75:1--75:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550278", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:34 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550278", abstract = "Image multi-label classification task is mainly to correctly predict multiple object categories in the images. To capture the correlation between labels, graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jabeen:2023:RMA, author = "Summaira Jabeen and Xi Li and Muhammad Shoib Amin and Omar Bourahla and Songyuan Li and Abdul Jabbar", title = "A Review on Methods and Applications in Multimodal Deep Learning", journal = j-TOMM, volume = "19", number = "2s", pages = "76:1--76:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3545572", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3545572", abstract = "Deep Learning has implemented a wide range of applications and has become increasingly popular in recent years. The goal of multimodal deep learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2023:IRG, author = "Sophie C. C. Sun and Yongkang Zhao and Fang-Wei Fu and Yawei Ren", title = "Improved Random Grid-based Cheating Prevention Visual Cryptography Using Latin Square", journal = j-TOMM, volume = "19", number = "2s", pages = "77:1--77:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550275", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550275", abstract = "Visual cryptography scheme is a method of encrypting secret image into n noiselike shares. 
The secret image can be reconstructed by stacking \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dong:2023:VFI, author = "Jiong Dong and Kaoru Ota and Mianxiong Dong", title = "Video Frame Interpolation: a Comprehensive Survey", journal = j-TOMM, volume = "19", number = "2s", pages = "78:1--78:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3556544", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3556544", abstract = "Video Frame Interpolation (VFI) is a fascinating and challenging problem in the computer vision (CV) field, aiming to generate non-existing frames \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2023:DKP, author = "Gaofeng Cao and Fei Zhou and Kanglin Liu and Anjie Wang and Leidong Fan", title = "A Decoupled Kernel Prediction Network Guided by Soft Mask for Single Image {HDR} Reconstruction", journal = j-TOMM, volume = "19", number = "2s", pages = "79:1--79:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550277", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550277", abstract = "Recent works on single image high dynamic range (HDR) reconstruction fail to hallucinate plausible textures, resulting in information missing and artifacts \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:PCQ, author = "Yipeng Liu and Qi Yang and Yiling Xu and Le Yang", title = "Point Cloud Quality Assessment: Dataset Construction and Learning-based No-reference Metric", journal = j-TOMM, volume = "19", number = "2s", pages = "80:1--80:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3550274", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550274", abstract = "Full-reference (FR) point cloud quality assessment (PCQA) has achieved impressive progress in recent years. However, in many cases, obtaining the reference \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:PAC, author = "Cheng Xu and Zejun Chen and Jiajie Mai and Xuemiao Xu and Shengfeng He", title = "Pose- and Attribute-consistent Person Image Synthesis", journal = j-TOMM, volume = "19", number = "2s", pages = "81:1--81:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3554739", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3554739", abstract = "Person Image Synthesis aims at transferring the appearance of the source person image into a target pose. Existing methods cannot handle large pose variations and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Park:2023:SCQ, author = "Jae Hyun Park and Sanghoon Kim and Joo Chan Lee and Jong Hwan Ko", title = "Scalable Color Quantization for Task-centric Image Compression", journal = j-TOMM, volume = "19", number = "2s", pages = "82:1--82:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3551389", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3551389", abstract = "Conventional image compression techniques targeted for the perceptual quality are not generally optimized for classification tasks using deep neural networks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Puig:2023:FFP, author = "Joan Manuel Marqu{\`e}s Puig and Helena Rif{\`a}-Pous and Samia Oukemeni", title = "From False-Free to Privacy-Oriented Communitarian Microblogging Social Networks", journal = j-TOMM, volume = "19", number = "2s", pages = "83:1--83:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3555354", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3555354", abstract = "Online Social Networks (OSNs) have gained enormous popularity in recent years. They provide a dynamic platform for sharing content (text messages or \ldots{} ) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2023:QGP, author = "Yiming Tang and Yi Yu", title = "Query-Guided Prototype Learning with Decoder Alignment and Dynamic Fusion in Few-Shot Segmentation", journal = j-TOMM, volume = "19", number = "2s", pages = "84:1--84:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3555314", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3555314", abstract = "Few-shot segmentation aims to segment objects belonging to a specific class under the guidance of a few annotated examples. Most existing approaches follow \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:MCM, author = "Zhiming Liu and Kai Niu and Zhiqiang He", title = "{ML-CookGAN}: Multi-Label Generative Adversarial Network for Food Image Generation", journal = j-TOMM, volume = "19", number = "2s", pages = "85:1--85:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3554738", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3554738", abstract = "Generating food images from recipe and ingredient information can be applied to many tasks such as food recommendation, recipe development, and health \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alwaely:2023:GGB, author = "Basheer Alwaely and Charith Abhayaratne", title = "{GHOSM}: Graph-based Hybrid Outline and Skeleton Modelling for Shape Recognition", journal = j-TOMM, volume = "19", number = "2s", pages = "86:1--86:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3554922", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3554922", abstract = "An efficient and accurate shape detection model plays a major role in many research areas. With the emergence of more complex shapes in real-life applications, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jonna:2023:DDK, author = "Sankaraganesh Jonna and Moushumi Medhi and Rajiv Ranjan Sahay", title = "{Distill-DBDGAN}: Knowledge Distillation and Adversarial Learning Framework for Defocus Blur Detection", journal = j-TOMM, volume = "19", number = "2s", pages = "87:1--87:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3557897", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3557897", abstract = "Defocus blur detection (DBD) aims to segment the blurred regions from a given image affected by defocus blur. It is a crucial pre-processing step for various \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ding:2023:BRD, author = "Xuewei Ding and Yingwei Pan and Yehao Li and Ting Yao and Dan Zeng and Tao Mei", title = "Boosting Relationship Detection in Images with Multi-Granular Self-Supervised Learning", journal = j-TOMM, volume = "19", number = "2s", pages = "88:1--88:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3556978", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3556978", abstract = "Visual and spatial relationship detection in images has been a fast-developing research topic in the multimedia field, which learns to recognize the semantic/spatial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chu:2023:RLT, author = "Binfei Chu and Yiting Lin and Bineng Zhong and Zhenjun Tang and Xianxian Li and Jing Wang", title = "Robust Long-Term Tracking via Localizing Occluders", journal = j-TOMM, volume = "19", number = "2s", pages = "89:1--89:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3557896", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3557896", abstract = "Occlusion is known as one of the most challenging factors in long-term tracking because of its unpredictable shape. Existing works devoted into the design of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:CPG, author = "Huisi Wu and Zhaoze Wang and Zhuoying Li and Zhenkun Wen and Jing Qin", title = "Context Prior Guided Semantic Modeling for Biomedical Image Segmentation", journal = j-TOMM, volume = "19", number = "2s", pages = "90:1--90:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3558520", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3558520", abstract = "Most state-of-the-art deep networks proposed for biomedical image segmentation are developed based on U-Net. While remarkable success has been \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:OBM, author = "Jun Wu and Tianliang Zhu and Jiahui Zhu and Tianyi Li and Chunzhi Wang", title = "A Optimized {BERT} for Multimodal Sentiment Analysis", journal = j-TOMM, volume = "19", number = "2s", pages = "91:1--91:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566126", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3566126", abstract = "Sentiment analysis of one modality (e.g., text or image) has been broadly studied. However, not much attention has been paid to the sentiment analysis of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:PTM, author = "Yongzong Xu and Zhijing Yang and Tianshui Chen and Kai Li and Chunmei Qing", title = "Progressive Transformer Machine for Natural Character Reenactment", journal = j-TOMM, volume = "19", number = "2s", pages = "92:1--92:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3559107", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3559107", abstract = "Character reenactment aims to control a target person's full-head movement by a driving monocular sequence that is made up of the driving character video. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tan:2023:IVV, author = "Chong Hong Tan and Koksheik Wong and Vishnu Monn Baskaran and Kiki Adhinugraha and David Taniar", title = "Is it Violin or {Viola}? {Classifying} the Instruments' Music Pieces using Descriptive Statistics", journal = j-TOMM, volume = "19", number = "2s", pages = "93:1--93:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563218", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3563218", abstract = "Classifying music pieces based on their instrument sounds is pivotal for analysis and application purposes. 
Given its importance, techniques using machine learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2023:ESM, author = "K. N. Singh and O. P. Singh and Amit Kumar Singh and Amrit Kumar Agrawal", title = "{EiMOL}: a Secure Medical Image Encryption Algorithm based on Optimization and the {Lorenz} System", journal = j-TOMM, volume = "19", number = "2s", pages = "94:1--94:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561513", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561513", abstract = "Nowadays, the demand for digital images from different intelligent devices and sensors has dramatically increased in smart healthcare. Due to advanced \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qiao:2023:UUE, author = "Ziteng Qiao and Dianxi Shi and Xiaodong Yi and Yanyan Shi and Yuhui Zhang and Yangyang Liu", title = "{UEFPN}: Unified and Enhanced Feature Pyramid Networks for Small Object Detection", journal = j-TOMM, volume = "19", number = "2s", pages = "95:1--95:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561824", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561824", abstract = "Object detection models based on feature pyramid networks have made significant progress in general object detection. However, small object detection is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:DLB, author = "Linwei Zhu and Yun Zhang and Na Li and Gangyi Jiang and Sam Kwong", title = "Deep Learning-Based Intra Mode Derivation for Versatile Video Coding", journal = j-TOMM, volume = "19", number = "2s", pages = "96:1--96:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563699", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3563699", abstract = "In intra coding, Rate Distortion Optimization (RDO) is performed to achieve the optimal intra mode from a pre-defined candidate list. The optimal intra mode is also \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2023:LEI, author = "Donghuo Zeng and Jianming Wu and Gen Hattori and Rong Xu and Yi Yu", title = "Learning Explicit and Implicit Dual Common Subspaces for Audio-visual Cross-modal Retrieval", journal = j-TOMM, volume = "19", number = "2s", pages = "97:1--97:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3564608", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3564608", abstract = "Audio-visual tracks in video contain rich semantic information with potential in many applications and research. Since the audio-visual data have inconsistent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2023:RTI, author = "Qiqi Gao and Jie Li and Tiejun Zhao and Yadong Wang", title = "Real-time Image Enhancement with Attention Aggregation", journal = j-TOMM, volume = "19", number = "2s", pages = "98:1--98:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3564607", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3564607", abstract = "Image enhancement has stimulated significant research works over the past years for its great application potential in video conferencing scenarios. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:TVB, author = "Yucheng Zhu and Xiongkuo Min and Dandan Zhu and Guangtao Zhai and Xiaokang Yang and Wenjun Zhang and Ke Gu and Jiantao Zhou", title = "Toward Visual Behavior and Attention Understanding for Augmented 360 Degree Videos", journal = j-TOMM, volume = "19", number = "2s", pages = "99:1--99:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565024", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3565024", abstract = "Augmented reality (AR) overlays digital content onto reality. In an AR system, correct and precise estimations of user visual fixations and head movements can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mei:2023:MSS, author = "Haiyang Mei and Letian Yu and Ke Xu and Yang Wang and Xin Yang and Xiaopeng Wei and Rynson W. H. Lau", title = "Mirror Segmentation via Semantic-aware Contextual Contrasted Feature Learning", journal = j-TOMM, volume = "19", number = "2s", pages = "100:1--100:??", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566127", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:35 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3566127", abstract = "Mirrors are everywhere in our daily lives. 
Existing computer vision systems do not consider mirrors, and hence may get confused by the reflected content \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:PSN, author = "Yi Zhang and Fang-Yi Chao and Wassim Hamidouche and Olivier Deforges", title = "{PAV-SOD}: a New Task towards Panoramic Audiovisual Saliency Detection", journal = j-TOMM, volume = "19", number = "3", pages = "101:1--101:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565267", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3565267", abstract = "Object-level audiovisual saliency detection in 360${}^\circ $ panoramic real-life dynamic scenes is important for exploring and modeling human perception in immersive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2023:TDW, author = "Chi Xie and Zikun Zhuang and Shengjie Zhao and Shuang Liang", title = "Temporal Dropout for Weakly Supervised Action Localization", journal = j-TOMM, volume = "19", number = "3", pages = "102:1--102:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3567827", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3567827", abstract = "Weakly supervised action localization is a challenging problem in video understanding and action recognition. Existing models usually formulate the training \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2023:MBR, author = "Yangyang Guo and Liqiang Nie and Harry Cheng and Zhiyong Cheng and Mohan Kankanhalli and Alberto {Del Bimbo}", title = "On Modality Bias Recognition and Reduction", journal = j-TOMM, volume = "19", number = "3", pages = "103:1--103:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565266", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3565266", abstract = "Making each modality in multi-modal data contribute is of vital importance to learning a versatile multi-modal model. Existing methods, however, are often \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:CTC, author = "Kang Xu and Weixin Li and Xia Wang and Xiaoyan Hu and Ke Yan and Xiaojie Wang and Xuan Dong", title = "{CUR} Transformer: a Convolutional Unbiased Regional Transformer for Image Denoising", journal = j-TOMM, volume = "19", number = "3", pages = "104:1--104:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566125", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3566125", abstract = "Image denoising is a fundamental problem in computer vision and multimedia computation. Non-local filters are effective for image denoising. But existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:BPL, author = "Wenxin Huang and Xuemei Jia and Xian Zhong and Xiao Wang and Kui Jiang and Zheng Wang", title = "Beyond the Parts: Learning Coarse-to-Fine Adaptive Alignment Representation for Person Search", journal = j-TOMM, volume = "19", number = "3", pages = "105:1--105:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565886", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3565886", abstract = "Person search is a time-consuming computer vision task that entails locating and recognizing query people in scenic pictures. 
Body components are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "105", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2023:DAP, author = "Hongchuan Yu and Mengqing Huang and Jian Jun Zhang", title = "Domain Adaptation Problem in Sketch Based Image Retrieval", journal = j-TOMM, volume = "19", number = "3", pages = "106:1--106:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565368", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3565368", abstract = "In this article, we present two algorithms that discover the discriminative structures of sketches, given pairs of sketches and photos in sketch-based image retrieval \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2023:TIF, author = "Han Yan and Haijun Zhang and Jianyang Shi and Jianghong Ma and Xiaofei Xu", title = "Toward Intelligent Fashion Design: a Texture and Shape Disentangled Generative Adversarial Network", journal = j-TOMM, volume = "19", number = "3", pages = "107:1--107:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3567596", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3567596", abstract = "Texture and shape in fashion, constituting essential elements of garments, characterize the body and surface of the fabric and outline the silhouette of clothing, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dou:2023:MTP, author = "Peng Dou and Ying Zeng and Zhuoqun Wang and Haifeng Hu", title = "Multiple Temporal Pooling Mechanisms for Weakly Supervised Temporal Action Localization", journal = j-TOMM, volume = "19", number = "3", pages = "108:1--108:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3567828", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3567828", abstract = "Recent action localization works learn in a weakly supervised manner to avoid the expensive cost of human labeling. Those works are mostly based on the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:MSE, author = "Lei Li and Zhiyuan Zhou and Suping Wu and Yongrong Cao", title = "Multi-scale Edge-guided Learning for {$3$D} Reconstruction", journal = j-TOMM, volume = "19", number = "3", pages = "109:1--109:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3568678", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3568678", abstract = "Single-view three-dimensional (3D) object reconstruction has always been a long-term challenging task. Objects with complex topologies are hard to accurately \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:LFR, author = "Zhengxue Wang and Guangwei Gao and Juncheng Li and Hui Yan and Hao Zheng and Huimin Lu", title = "Lightweight Feature De-redundancy and Self-calibration Network for Efficient Image Super-resolution", journal = j-TOMM, volume = "19", number = "3", pages = "110:1--110:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3569900", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3569900", abstract = "In recent years, thanks to the inherent powerful feature representation and learning abilities of the convolutional neural network (CNN), deep CNN-steered single \ldots{}", acknowledgement = ack-nhfb, ajournal = 
"ACM Trans. Multimed Comput. Commun. Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:FTF, author = "Zhijie Huang and Jun Sun and Xiaopeng Guo", title = "{FastCNN}: Towards Fast and Accurate Spatiotemporal Network for {HEVC} Compressed Video Enhancement", journal = j-TOMM, volume = "19", number = "3", pages = "111:1--111:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3569583", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3569583", abstract = "Deep neural networks have achieved remarkable success in HEVC compressed video quality enhancement. However, most existing multiframe-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:DPS, author = "Xiaohan Wang and Linchao Zhu and Fei Wu and Yi Yang", title = "A Differentiable Parallel Sampler for Efficient Video Classification", journal = j-TOMM, volume = "19", number = "3", pages = "112:1--112:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3569584", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3569584", abstract = "It is crucial to sample a small portion of relevant frames for efficient video classification. 
The existing methods mainly develop hand-designed sampling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:TFE, author = "Junjie Li and Jin Yuan and Zhiyong Li", title = "{TP-FER}: an Effective Three-phase Noise-tolerant Recognizer for Facial Expression Recognition", journal = j-TOMM, volume = "19", number = "3", pages = "113:1--113:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570329", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3570329", abstract = "Single-label facial expression recognition (FER), which aims to classify single expression for facial images, usually suffers from the label noisy and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:LEF, author = "Baojin Huang and Zhongyuan Wang and Guangcheng Wang and Zhen Han and Kui Jiang", title = "Local Eyebrow Feature Attention Network for Masked Face Recognition", journal = j-TOMM, volume = "19", number = "3", pages = "114:1--114:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3569943", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3569943", abstract = "During the COVID-19 coronavirus epidemic, wearing masks has become increasingly popular. 
Traditional occlusion face recognition algorithms are almost \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:ESI, author = "Bin-Cheng Yang and Gangshan Wu", title = "Efficient Single-image Super-resolution Using Dual path Connections with Multiple scale Learning", journal = j-TOMM, volume = "19", number = "3", pages = "115:1--115:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570164", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3570164", abstract = "Deep convolutional neural networks have been demonstrated to be effective for single-image super-resolution in recent years. On the one hand, residual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2023:AAM, author = "Wei Zhou and Yanke Hou and Dihu Chen and Haifeng Hu and Tao Su", title = "Attention-Augmented Memory Network for Image Multi-Label Classification", journal = j-TOMM, volume = "19", number = "3", pages = "116:1--116:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570166", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3570166", abstract = "The purpose of image multi-label classification is to predict all the object categories presented in an image. 
Some recent works exploit graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "116", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hui:2023:MGC, author = "Shuaixiong Hui and Qiang Guo and Xiaoyu Geng and Caiming Zhang", title = "Multi-Guidance {CNNs} for Salient Object Detection", journal = j-TOMM, volume = "19", number = "3", pages = "117:1--117:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570507", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3570507", abstract = "Feature refinement and feature fusion are two key steps in convolutional neural networks-based salient object detection (SOD). In this article, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xing:2023:PPI, author = "Kai Xing and Tao Li and Xuanhan Wang", title = "{ProposalVLAD} with Proposal-Intra Exploring for Temporal Action Proposal Generation", journal = j-TOMM, volume = "19", number = "3", pages = "118:1--118:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571747", abstract = "Temporal action proposal generation aims to localize temporal segments of human activities in videos. 
Current boundary-based proposal generation methods can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2023:DUK, author = "Hao Tang and Lei Ding and Songsong Wu and Bin Ren and Nicu Sebe and Paolo Rota", title = "Deep Unsupervised Key Frame Extraction for Efficient Video Classification", journal = j-TOMM, volume = "19", number = "3", pages = "119:1--119:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571735", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571735", abstract = "Video processing and analysis have become an urgent task, as a huge amount of videos (e.g., YouTube, Hulu) are uploaded online every day. The extraction of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:ERI, author = "Ling Zhang and Chengjiang Long and Xiaolong Zhang and Chunxia Xiao", title = "Exploiting Residual and Illumination with {GANs} for Shadow Detection and Shadow Removal", journal = j-TOMM, volume = "19", number = "3", pages = "120:1--120:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571745", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571745", abstract = "Residual image and illumination estimation have been proven to be helpful for image enhancement. In this article, we propose a general framework, called RI-GAN, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:DRI, author = "Yushu Zhang and Nuo Chen and Shuren Qi and Mingfu Xue and Zhongyun Hua", title = "Detection of Recolored Image by Texture Features in Chrominance Components", journal = j-TOMM, volume = "19", number = "3", pages = "121:1--121:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571076", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571076", abstract = "Image recoloring is an emerging editing technique that can change the color style of an image by modifying pixel values without altering the original image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xue:2023:HFF, author = "Han Xue and Jun Ling and Anni Tang and Li Song and Rong Xie and Wenjun Zhang", title = "High-Fidelity Face Reenactment Via Identity-Matched Correspondence Learning", journal = j-TOMM, volume = "19", number = "3", pages = "122:1--122:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571857", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571857", abstract = "Face reenactment aims to generate an animation of a source face using the poses and expressions from a target face. Although recent methods have made \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:PHD, author = "Haozhe Chen and Hang Zhou and Jie Zhang and Dongdong Chen and Weiming Zhang and Kejiang Chen and Gang Hua and Nenghai Yu", title = "Perceptual Hashing of Deep Convolutional Neural Networks for Model Copy Detection", journal = j-TOMM, volume = "19", number = "3", pages = "123:1--123:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572777", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572777", abstract = "In recent years, many model intellectual property (IP) proof methods for IP protection have been proposed, such as model watermarking and model \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Duan:2023:MGL, author = "Wei Duan and Yi Yu and Xulong Zhang and Suhua Tang and Wei Li and Keizo Oyama", title = "Melody Generation from Lyrics with Local Interpretability", journal = j-TOMM, volume = "19", number = "3", pages = "124:1--124:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572031", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572031", abstract = "Melody generation aims to learn the distribution of real melodies to generate new melodies conditioned on lyrics, which has been a very interesting topic in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:TFG, author = "Shiguang Liu and Huixin Wang", title = "Talking Face Generation via Facial Anatomy", journal = j-TOMM, volume = "19", number = "3", pages = "125:1--125:??", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571746", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571746", abstract = "To generate the corresponding talking face from a speech audio and a face image, it is essential to match the variations in the facial appearance with the speech audio \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2023:TIA, author = "Zengri Zeng and Baokang Zhao and Han-Chieh Chao and Ilsun You and Kuo-Hui Yeh and Weizhi Meng", title = "Towards Intelligent Attack Detection Using {DNA} Computing", journal = j-TOMM, volume = "19", number = "3s", pages = "126:1--126:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561057", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561057", abstract = "In recent years, frequent network attacks have seriously threatened the interests and security of humankind. To address this threat, many detection methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:DCB, author = "Jinxia Wang and Rui Chen and Zhihan Lv", title = "{DNA} Computing-Based Multi-Source Data Storage Model in Digital Twins", journal = j-TOMM, volume = "19", number = "3s", pages = "127:1--127:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561823", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561823", abstract = "The work aims to study the application of Deoxyribonucleic Acid (DNA) multi-source data storage in Digital Twins (DT). Through the investigation of the research \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ahmed:2023:DBC, author = "Fawad Ahmed and Muneeb Ur Rehman and Jawad Ahmad and Muhammad Shahbaz Khan and Wadii Boulila and Gautam Srivastava and Jerry Chun-Wei Lin and William J. Buchanan", title = "A {DNA} Based Colour Image Encryption Scheme Using A Convolutional Autoencoder", journal = j-TOMM, volume = "19", number = "3s", pages = "128:1--128:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570165", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3570165", abstract = "With the advancement in technology, digital images can easily be transmitted and stored over the Internet. Encryption is used to avoid illegal interception of digital \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Menon:2023:EEM, author = "Vignesh V. 
Menon and Hadi Amirpour and Mohammad Ghanbari and Christian Timmerer", title = "{EMES}: Efficient Multi-encoding Schemes for {HEVC}-based Adaptive Bitrate Streaming", journal = j-TOMM, volume = "19", number = "3s", pages = "129:1--129:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575659", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3575659", abstract = "In HTTP Adaptive Streaming (HAS), videos are encoded at multiple bitrates and spatial resolutions (i.e., representations ) to adapt to the heterogeneity of network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:VAC, author = "Jiwei Zhang and Yi Yu and Suhua Tang and Jianming Wu and Wei Li", title = "Variational Autoencoder with {CCA} for Audio-Visual Cross-modal Retrieval", journal = j-TOMM, volume = "19", number = "3s", pages = "130:1--130:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575658", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3575658", abstract = "Cross-modal retrieval is to utilize one modality as a query to retrieve data from another modality, which has become a popular topic in information \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Le:2023:SAV, author = "Thi-Ngoc-Hanh Le and Ya-Hsuan Chen and Tong-Yee Lee", title = "Structure-aware Video Style Transfer with Map Art", journal = j-TOMM, volume = "19", number = "3s", pages = "131:1--131:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572030", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572030", abstract = "Changing the style of an image/video while preserving its content is a crucial criterion to access a new neural style transfer algorithm. However, it is very challenging to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2023:PMT, author = "Sirui Zhao and Hongyu Jiang and Hanqing Tao and Rui Zha and Kun Zhang and Tong Xu and Enhong Chen", title = "{PEDM}: a Multi-task Learning Model for Persona-aware Emoji-embedded Dialogue Generation", journal = j-TOMM, volume = "19", number = "3s", pages = "132:1--132:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571819", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571819", abstract = "As a vivid and linguistic symbol, Emojis have become a prevailing medium interspersed in text-based communication (e.g., social media and chit-chat) to express \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:FCN, author = "Heyu Huang and Runmin Cong and Lianhe Yang and Ling Du and Cong Wang and Sam Kwong", title = "Feedback Chain Network for Hippocampus Segmentation", journal = j-TOMM, volume = "19", number = "3s", pages = "133:1--133:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571744", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3571744", abstract = "The hippocampus plays a vital role in the diagnosis and treatment of many neurological disorders. Recent years, deep learning technology has made \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yao:2023:CRA, author = "Xuanrong Yao and Xin Wang and Yue Liu and Wenwu Zhu", title = "Continual Recognition with Adaptive Memory Update", journal = j-TOMM, volume = "19", number = "3s", pages = "134:1--134:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3573202", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3573202", abstract = "Class incremental continual learning aims to improve the ability of modern classification models to continually recognize new classes without forgetting the previous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:AAM, author = "Jingyao Wang and Luntian Mou and Lei Ma and Tiejun Huang and Wen Gao", title = "{AMSA}: Adaptive Multimodal Learning for Sentiment Analysis", journal = j-TOMM, volume = "19", number = "3s", pages = "135:1--135:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572915", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572915", abstract = "Efficient recognition of emotions has attracted extensive research interest, which makes new applications in many fields possible, such as human-computer \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "135", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2023:JAC, author = "Shaoning Zeng and Yunbo Rao and Bob Zhang and Yong Xu", title = "Joint Augmented and Compressed Dictionaries for Robust Image Classification", journal = j-TOMM, volume = "19", number = "3s", pages = "136:1--136:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572910", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572910", abstract = "Dictionary-based Classification (DC) has been a promising learning theory in multimedia computing. Previous studies focused on learning a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "136", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wanyan:2023:DSG, author = "Yuyang Wanyan and Xiaoshan Yang and Xuan Ma and Changsheng Xu", title = "Dual Scene Graph Convolutional Network for Motivation Prediction", journal = j-TOMM, volume = "19", number = "3s", pages = "137:1--137:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572914", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572914", abstract = "Humans can easily infer the motivations behind human actions from only visual data by comprehensively analyzing the complex context information and utilizing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "137", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lei:2023:LUD, author = "Fei Lei and Zhongqi Cao and Yuning Yang and Yibo Ding and Cong Zhang", title = "Learning the User's Deeper Preferences for Multi-modal Recommendation Systems", journal = j-TOMM, volume = "19", number = "3s", pages = "138:1--138:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3573010", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3573010", abstract = "Recommendation system plays an important role in the rapid development of micro-video sharing platform. Micro-video has rich modal features, such as visual, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "138", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2023:FDP, author = "Xuehu Yan and Longlong Li and Lei Sun and Jia Chen and Shudong Wang", title = "Fake and Dishonest Participant Immune Secret Image Sharing", journal = j-TOMM, volume = "19", number = "4", pages = "139:1--139:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572842", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572842", abstract = "Secret image sharing (SIS) has received increased attention from the research community because of its usefulness in multiparty secure computing, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "139", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:SCF, author = "Song Yang and Qiang Li and Wenhui Li and Xuan-Ya Li and Ran Jin and Bo Lv and Rui Wang and Anan Liu", title = "Semantic Completion and Filtration for Image-Text Retrieval", journal = j-TOMM, volume = "19", number = "4", pages = "140:1--140:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572844", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572844", abstract = "Image-text retrieval is a vital task in computer vision and has received growing attention, since it connects cross-modality data. It comes with the critical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "140", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2023:MSK, author = "Xuan Ma and Xiaoshan Yang and Changsheng Xu", title = "Multi-Source Knowledge Reasoning Graph Network for Multi-Modal Commonsense Inference", journal = j-TOMM, volume = "19", number = "4", pages = "141:1--141:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3573201", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3573201", abstract = "As a crucial part of natural language processing, event-centered commonsense inference task has attracted increasing attention. With a given observed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "141", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:APA, author = "Shangxi Wu and Jitao Sang and Kaiyuan Xu and Jiaming Zhang and Jian Yu", title = "Attention, Please! {Adversarial} Defense via Activation Rectification and Preservation", journal = j-TOMM, volume = "19", number = "4", pages = "142:1--142:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572843", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572843", abstract = "This study provides a new understanding of the adversarial attack problem by examining the correlation between adversarial attack and visual attention change. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "142", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:CSA, author = "Kan Wang and Changxing Ding and Jianxin Pang and Xiangmin Xu", title = "Context Sensing Attention Network for Video-based Person Re-identification", journal = j-TOMM, volume = "19", number = "4", pages = "143:1--143:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3573203", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3573203", abstract = "Video-based person re-identification (ReID) is challenging due to the presence of various interferences in video frames. Recent approaches handle this problem \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "143", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:SSL, author = "Wenjing Wang and Lilang Lin and Zejia Fan and Jiaying Liu", title = "Semi-supervised Learning for {Mars} Imagery Classification and Segmentation", journal = j-TOMM, volume = "19", number = "4", pages = "144:1--144:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572916", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572916", abstract = "With the progress of Mars exploration, numerous Mars image data are being collected and need to be analyzed. However, due to the severe train-test gap \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "144", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:DDD, author = "Hui Liu and Shanshan Li and Jicheng Zhu and Kai Deng and Meng Liu and Liqiang Nie", title = "{DDIFN}: a Dual-discriminator Multi-modal Medical Image Fusion Network", journal = j-TOMM, volume = "19", number = "4", pages = "145:1--145:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3574136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3574136", abstract = "Multi-modal medical image fusion is a long-standing important research topic that can obtain informative medical images and assist doctors diagnose and treat \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "145", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2023:DGD, author = "Xintian Wu and Huanyu Wang and Yiming Wu and Xi Li", title = "{D$^3$T-GAN}: Data-Dependent Domain Transfer {GANs} for Image Generation with Limited Data", journal = j-TOMM, volume = "19", number = "4", pages = "146:1--146:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3576858", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3576858", abstract = "As an important and challenging problem, image generation with limited data aims at generating realistic images through training a GAN model given few samples. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "146", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:NLA, author = "Dandan Zhu and Xuan Shao and Qiangqiang Zhou and Xiongkuo Min and Guangtao Zhai and Xiaokang Yang", title = "A Novel Lightweight Audio-visual Saliency Model for Videos", journal = j-TOMM, volume = "19", number = "4", pages = "147:1--147:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3576857", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3576857", abstract = "Audio information has not been considered an important factor in visual attention models regardless of many psychological studies that have shown the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "147", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Abdussalam:2023:NNC, author = "Amr Abdussalam and Zhongfu Ye and Ammar Hawbani and Majjed Al-Qatf and Rashid Khan", title = "{NumCap}: a Number-controlled Multi-caption Image Captioning Network", journal = j-TOMM, volume = "19", number = "4", pages = "148:1--148:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3576927", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3576927", abstract = "Image captioning is a promising task that attracted researchers in the last few years. Existing image captioning models are primarily trained to generate one \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "148", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:DML, author = "Hao Liu and Zhaoyu Yan and Bing Liu and Jiaqi Zhao and Yong Zhou and Abdulmotaleb {El Saddik}", title = "Distilled Meta-learning for Multi-Class Incremental Learning", journal = j-TOMM, volume = "19", number = "4", pages = "149:1--149:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3576045", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3576045", abstract = "Meta-learning approaches have recently achieved promising performance in multi-class incremental learning. However, meta-learners still suffer from \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "149", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2023:GAT, author = "Jin Yuan and Shikai Chen and Yao Zhang and Zhongchao Shi and Xin Geng and Jianping Fan and Yong Rui", title = "Graph Attention Transformer Network for Multi-label Image Classification", journal = j-TOMM, volume = "19", number = "4", pages = "150:1--150:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578518", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578518", abstract = "Multi-label classification aims to recognize multiple objects or attributes from images. 
The key to solving this issue relies on effectively characterizing the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "150", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hou:2023:UUI, author = "Guojia Hou and Yuxuan Li and Huan Yang and Kunqian Li and Zhenkuan Pan", title = "{UID2021}: an Underwater Image Dataset for Evaluation of No-Reference Quality Assessment Metrics", journal = j-TOMM, volume = "19", number = "4", pages = "151:1--151:??", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578584", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Jun 22 10:29:37 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578584", abstract = "Achieving subjective and objective quality assessment of underwater images is of high significance in underwater visual perception and image/video processing. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "151", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Carlsson:2023:CUS, author = "Niklas Carlsson and Derek Eager", title = "Cross-User Similarities in Viewing Behavior for 360${}^\circ $ Video and Caching Implications", journal = j-TOMM, volume = "19", number = "5", pages = "152:1--152:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3507917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3507917", abstract = "The demand and usage of 360${}^\circ $ video services are expected to increase. 
However, despite these services being highly bandwidth intensive, not much is known about the potential value that basic bandwidth saving techniques such as server or edge-network on-demand \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "152", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:EEH, author = "Ziqiang Li and Pengfei Xia and Xue Rui and Bin Li", title = "Exploring the Effect of High-frequency Components in {GANs} Training", journal = j-TOMM, volume = "19", number = "5", pages = "153:1--153:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578585", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578585", abstract = "Generative Adversarial Networks (GANs) have the ability to generate images that are visually indistinguishable from real images. However, recent studies have revealed that generated and real images share significant differences in the frequency domain. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "153", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yin:2023:FFM, author = "Haibing Yin and Hongkui Wang and Li Yu and Junhui Liang and Guangtao Zhai", title = "Feedforward and Feedback Modulations Based Foveated {JND} Estimation for Images", journal = j-TOMM, volume = "19", number = "5", pages = "154:1--154:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579094", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579094", abstract = "The just noticeable difference (JND) reveals the key characteristic of visual perception, which has been widely used in many perception-based image and video applications. Nevertheless, the modulatory mechanism of the human visual system (HVS) has not \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "154", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:MID, author = "Taocun Yang and Yaping Huang and Yanlin Xie and Junbo Liu and Shengchun Wang", title = "{MixOOD}: Improving Out-of-distribution Detection with Enhanced Data Mixup", journal = j-TOMM, volume = "19", number = "5", pages = "155:1--155:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578935", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578935", abstract = "Detecting out-of-distribution (OOD) inputs for deep learning models is a critical task when models are deployed in real-world environments. 
Recently, a large number of works have been dedicated to tackling the OOD detection problem. One of the most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "155", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2023:MLC, author = "Hao Wei and Rui Chen", title = "A Multi-Level Consistency Network for High-Fidelity Virtual Try-On", journal = j-TOMM, volume = "19", number = "5", pages = "156:1--156:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3580500", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580500", abstract = "The 2D virtual try-on task aims to transfer a target clothing image to the corresponding region of a person image. Although an extensive amount of research has been conducted due to its immense applications, this task still remains a great challenge to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "156", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hao:2023:FGT, author = "Jiachang Hao and Haifeng Sun and Pengfei Ren and Yiming Zhong and Jingyu Wang and Qi Qi and Jianxin Liao", title = "Fine-Grained Text-to-Video Temporal Grounding from Coarse Boundary", journal = j-TOMM, volume = "19", number = "5", pages = "157:1--157:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579825", abstract = "Text-to-video temporal grounding aims to locate a target video moment that semantically corresponds to the given sentence query in an untrimmed video. In this task, fully supervised works require text descriptions for each event along with its temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "157", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:DLH, author = "Weixin Li and Tiantian Cao and Chang Liu and Xue Tian and Ya Li and Xiaojie Wang and Xuan Dong", title = "Dual-Lens {HDR} using Guided {$3$D} Exposure {CNN} and Guided Denoising Transformer", journal = j-TOMM, volume = "19", number = "5", pages = "158:1--158:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579167", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579167", abstract = "We study the high dynamic range (HDR) imaging problem in dual-lens systems. 
Existing methods usually treat the HDR imaging problem as an image fusion problem and the HDR result is estimated by fusing the aligned short exposure image and long exposure \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "158", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2023:HHF, author = "Xin Yang and Hengrui Li and Xiaochuan Li and Tao Li", title = "{HIFGAN}: a High-Frequency Information-Based Generative Adversarial Network for Image Super-Resolution", journal = j-TOMM, volume = "19", number = "5", pages = "159:1--159:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578934", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578934", abstract = "Since the neural network was introduced into the super-resolution (SR) field, many SR deep models have been proposed and have achieved excellent results. However, there are two main drawbacks: one is that the methods based on the best peak-signal-to-noise \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "159", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:DMO, author = "Yang Li", title = "Detection of Moving Object Using Superpixel Fusion Network", journal = j-TOMM, volume = "19", number = "5", pages = "160:1--160:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579998", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579998", abstract = "Moving object detection is still a challenging task in complex scenes. The existing methods based on deep learning mainly use U-Nets and have achieved amazing results. However, they ignore the local continuity between pixels. In order to solve this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "160", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2023:BTO, author = "Yingwei Pan and Yehao Li and Ting Yao and Tao Mei", title = "Bottom-up and Top-down Object Inference Networks for Image Captioning", journal = j-TOMM, volume = "19", number = "5", pages = "161:1--161:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3580366", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580366", abstract = "A bottom-up and top-down attention mechanism has led to the revolutionizing of image captioning techniques, which enables object-level attention for multi-step reasoning over all the detected objects. 
However, when humans describe an image, they often \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "161", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2023:MMK, author = "Duoduo Feng and Xiangteng He and Yuxin Peng", title = "{MKVSE}: Multimodal Knowledge Enhanced Visual-semantic Embedding for Image-text Retrieval", journal = j-TOMM, volume = "19", number = "5", pages = "162:1--162:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3580501", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580501", abstract = "Image-text retrieval aims to take the text (image) query to retrieve the semantically relevant images (texts), which is fundamental and critical in the search system, online shopping, and social network. Existing works have shown the effectiveness of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "162", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2023:BTG, author = "Mengyi Zhao and Hao Tang and Pan Xie and Shuling Dai and Nicu Sebe and Wei Wang", title = "Bidirectional Transformer {GAN} for Long-term Human Motion Prediction", journal = j-TOMM, volume = "19", number = "5", pages = "163:1--163:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579359", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579359", abstract = "The mainstream motion prediction methods usually focus on short-term prediction, and their predicted long-term motions often fall into an average pose, i.e., the freezing forecasting problem [ 27 ]. To mitigate this problem, we propose a novel Bidirectional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "163", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:RVS, author = "Jian Wang and Qiang Ling and Peiyan Li", title = "Robust Video Stabilization based on Motion Decomposition", journal = j-TOMM, volume = "19", number = "5", pages = "164:1--164:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3580498", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 07:03:55 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580498", abstract = "Video stabilization aims to eliminate camera jitter and improve the visual experience of shaky videos. 
Video stabilization methods often ignore the active movement of the foreground objects and the camera, and may result in distortion and over-smoothing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "164", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Franti:2023:DPC, author = "Pasi Fr{\"a}nti and Nancy Fazal", title = "Design Principles for Content Creation in Location-Based Games", journal = j-TOMM, volume = "19", number = "5s", pages = "165:1--165:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3583689", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3583689", abstract = "Location-based games have been around since 2000 across various fields, including education, health, and entertainment. The main challenge facing such games \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "165", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2023:VNB, author = "Chenchi Zhang and Wenbo Ma and Jun Xiao and Hanwang Zhang and Jian Shao and Yueting Zhuang and Long Chen", title = "{VL-NMS}: Breaking Proposal Bottlenecks in Two-stage Visual-language Matching", journal = j-TOMM, volume = "19", number = "5s", pages = "166:1--166:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579095", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3579095", abstract = "The prevailing framework for matching multimodal inputs is based on a two-stage process: (1) detecting proposals with an object detector and (2) matching \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "166", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mackowski:2023:MPI, author = "Micha{\l} Ma{\'c}kowski and Piotr Brzoza and Mateusz Kawulok and Rafa{\l} Meisel and Dominik Spinczyk", title = "Multimodal Presentation of Interactive Audio-Tactile Graphics Supporting the Perception of Visual Information by Blind People", journal = j-TOMM, volume = "19", number = "5s", pages = "167:1--167:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3586076", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3586076", abstract = "Due to the limitations in the perception of graphical information by blind people and the need to substitute the sense of sight with other senses, the correct use \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "167", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Man:2023:TTE, author = "Xin Man and Jie Shao and Feiyu Chen and Mingxing Zhang and Heng Tao Shen", title = "{TEVL}: Trilinear Encoder for Video-language Representation Learning", journal = j-TOMM, volume = "19", number = "5s", pages = "168:1--168:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3585388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3585388", abstract = "Pre-training model on large-scale unlabeled web videos followed by task-specific fine-tuning is a canonical approach to learning video and language \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "168", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ricci:2023:MLA, author = "Simone Ricci and Tiberio Uricchio and Alberto {Del Bimbo}", title = "Meta-learning Advisor Networks for Long-tail and Noisy Labels in Social Image Classification", journal = j-TOMM, volume = "19", number = "5s", pages = "169:1--169:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3584360", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3584360", abstract = "Deep neural networks (DNNs) for social image classification are prone to performance reduction and overfitting when trained on datasets plagued by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "169", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:LBR, author = "Chen Li and Li Song and Rong Xie and Wenjun Zhang", title = "Local Bidirection Recurrent Network for Efficient Video Deblurring with the Fused Temporal Merge Module", journal = j-TOMM, volume = "19", number = "5s", pages = "170:1--170:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587468", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587468", abstract = "Video deblurring methods exploit the correlation between consecutive blurry inputs to generate sharp frames. However, designing an effective and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "170", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Niu:2023:VCL, author = "Tian-Zi Niu and Zhen-Duo Chen and Xin Luo and Peng-Fei Zhang and Zi Huang and Xin-Shun Xu", title = "Video Captioning by Learning from Global Sentence and Looking Ahead", journal = j-TOMM, volume = "19", number = "5s", pages = "171:1--171:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587252", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587252", abstract = "Video captioning aims to automatically generate natural language sentences describing the content of a video. Although encoder-decoder-based models \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "171", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:GAE, author = "Yang Wang and Bo Dong and Ke Xu and Haiyin Piao and Yufei Ding and Baocai Yin and Xin Yang", title = "A Geometrical Approach to Evaluate the Adversarial Robustness of Deep Neural Networks", journal = j-TOMM, volume = "19", number = "5s", pages = "172:1--172:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587936", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587936", abstract = "Deep neural networks (DNNs) are widely used for computer vision tasks. However, it has been shown that deep models are vulnerable to adversarial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "172", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2023:LML, author = "Suncheng Xiang and Dahong Qian and Mengyuan Guan and Binjie Yan and Ting Liu and Yuzhuo Fu and Guanjie You", title = "Less Is More: Learning from Synthetic Data with Fine-Grained Attributes for Person Re-Identification", journal = j-TOMM, volume = "19", number = "5s", pages = "173:1--173:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3588441", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588441", abstract = "Person re-identification (ReID) plays an important role in applications such as public security and video surveillance. 
Recently, learning from synthetic data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "173", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Siekkinen:2023:NNA, author = "Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen", title = "Neural Network Assisted Depth Map Packing for Compression Using Standard Hardware Video Codecs", journal = j-TOMM, volume = "19", number = "5s", pages = "174:1--174:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3588440", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588440", abstract = "Depth maps are needed by various graphics rendering and processing operations. Depth map streaming is often necessary when such operations are performed in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "174", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{vanRensburg:2023:OWD, author = "Bianca Jansen van Rensburg and Pauline Puteaux and William Puech and Jean-Pierre Pedeboy", title = "{$3$D} Object Watermarking from Data Hiding in the Homomorphic Encrypted Domain", journal = j-TOMM, volume = "19", number = "5s", pages = "175:1--175:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3588573", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588573", abstract = "For over a decade, 3D objects are an increasingly popular form of media. It has become necessary and urgent to secure them during their transmission or \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "175", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:CSR, author = "Hao Liu and Xiaoshan Yang and Changsheng Xu", title = "Counterfactual Scenario-relevant Knowledge-enriched Multi-modal Emotion Reasoning", journal = j-TOMM, volume = "19", number = "5s", pages = "176:1--176:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3583690", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3583690", abstract = "Multi-modal video emotion reasoning (MERV) has recently attracted increasing attention due to its potential application in human-computer interaction. 
This \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "176", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ayoughi:2023:SCE, author = "Melika Ayoughi and Pascal Mettes and Paul Groth", title = "Self-contained Entity Discovery from Captioned Videos", journal = j-TOMM, volume = "19", number = "5s", pages = "177:1--177:??", month = oct, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3583138", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Jul 3 08:37:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3583138", abstract = "This article introduces the task of visual named entity discovery in videos without the need for task-specific supervision or task-specific external knowledge \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "177", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2023:CFP, author = "Jin Xie and Yanwei Pang and Jing Pan and Jing Nie and Jiale Cao and Jungong Han", title = "Complementary Feature Pyramid Network for Object Detection", journal = j-TOMM, volume = "19", number = "6", pages = "178:1--178:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3584362", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3584362", abstract = "The way of constructing a robust feature pyramid is crucial for object detection. 
However, existing feature pyramid methods, which aggregate multi-level features by using element-wise sum or concatenation, are inefficient to construct a robust feature \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "178", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:DCP, author = "Tianyi Wang and Harry Cheng and Kam Pui Chow and Liqiang Nie", title = "Deep Convolutional Pooling Transformer for Deepfake Detection", journal = j-TOMM, volume = "19", number = "6", pages = "179:1--179:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3588574", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588574", abstract = "Recently, Deepfake has drawn considerable public attention due to security and privacy concerns in social media digital forensics. As the wildly spreading Deepfake videos on the Internet become more realistic, traditional detection techniques have failed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "179", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chan:2023:LDF, author = "Patrick P. K. 
Chan and Xiaoman Hu and Haorui Song and Peng Peng and Keke Chen", title = "Learning Disentangled Features for Person Re-identification under Clothes Changing", journal = j-TOMM, volume = "19", number = "6", pages = "180:1--180:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3584359", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3584359", abstract = "Clothes changing is one of the challenges in person re-identification (ReID), since clothes provide remarkable and reliable information for decision, especially when the resolution of an image is low. Variation of clothes significantly downgrades standard \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "180", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2023:CFG, author = "Rongfei Zeng and Mai Su and Ruiyun Yu and Xingwei Wang", title = "{CD$^2$}: Fine-grained {$3$D} Mesh Reconstruction with Twice Chamfer Distance", journal = j-TOMM, volume = "19", number = "6", pages = "181:1--181:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3582694", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3582694", abstract = "Monocular 3D reconstruction is to reconstruct the shape of object and its other information from a single RGB image. In 3D reconstruction, polygon mesh, with detailed surface information and low computational cost, is the most prevalent expression form \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "181", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Niu:2023:SEV, author = "Tian-Zi Niu and Shan-Shan Dong and Zhen-Duo Chen and Xin Luo and Shanqing Guo and Zi Huang and Xin-Shun Xu", title = "Semantic Enhanced Video Captioning with Multi-feature Fusion", journal = j-TOMM, volume = "19", number = "6", pages = "182:1--182:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3588572", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588572", abstract = "Video captioning aims to automatically describe a video clip with informative sentences. At present, deep learning-based models have become the mainstream for this task and achieved competitive results on public datasets. Usually, these methods leverage \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "182", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:TBV, author = "Kun Li and Jiaxiu Li and Dan Guo and Xun Yang and Meng Wang", title = "Transformer-Based Visual Grounding with Cross-Modality Interaction", journal = j-TOMM, volume = "19", number = "6", pages = "183:1--183:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587251", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587251", abstract = "This article tackles the challenging yet important task of Visual Grounding (VG), which aims to localize a visual region in the given image referred by a natural language query. Existing efforts on the VG task are twofold: (1) two-stage methods first \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "183", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2023:VPG, author = "Jiayuan Xie and Jiali Chen and Yi Cai and Qingbao Huang and Qing Li", title = "Visual Paraphrase Generation with Key Information Retained", journal = j-TOMM, volume = "19", number = "6", pages = "184:1--184:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3585010", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3585010", abstract = "Visual paraphrase generation task aims to rewrite a given image-related original sentence into a new paraphrase, where the paraphrase needs to have the same expressed meaning as the original sentence but have a difference in expression form. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "184", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2023:NVS, author = "Bingzheng Liu and Jianjun Lei and Bo Peng and Chuanbo Yu and Wanqing Li and Nam Ling", title = "Novel View Synthesis from a Single Unposed Image via Unsupervised Learning", journal = j-TOMM, volume = "19", number = "6", pages = "186:1--186:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587467", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587467", abstract = "Novel view synthesis aims to generate novel views from one or more given source views. 
Although existing methods have achieved promising performance, they usually require paired views with different poses to learn a pixel transformation. This article \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "186", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2023:LLI, author = "Mingliang Zhou and Hongyue Leng and Bin Fang and Tao Xiang and Xuekai Wei and Weijia Jia", title = "Low-light Image Enhancement via a Frequency-based Model with Structure and Texture Decomposition", journal = j-TOMM, volume = "19", number = "6", pages = "187:1--187:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3590965", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3590965", abstract = "This article proposes a frequency-based structure and texture decomposition model in a Retinex-based framework for low-light image enhancement and noise suppression. First, we utilize the total variation-based noise estimation to decompose the observed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "187", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:AAM, author = "Hongguang Zhu and Yunchao Wei and Yao Zhao and Chunjie Zhang and Shujuan Huang", title = "{AMC}: Adaptive Multi-expert Collaborative Network for Text-guided Image Retrieval", journal = j-TOMM, volume = "19", number = "6", pages = "188:1--188:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3584703", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3584703", abstract = "Text-guided image retrieval integrates reference image and text feedback as a multimodal query to search the image corresponding to user intention. Recent approaches employ multi-level matching, multiple accesses, or multiple subnetworks for better \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "188", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fontanini:2023:UDM, author = "Tomaso Fontanini and Luca Donati and Massimo Bertozzi and Andrea Prati", title = "Unsupervised Discovery and Manipulation of Continuous Disentangled Factors of Variation", journal = j-TOMM, volume = "19", number = "6", pages = "189:1--189:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3591358", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3591358", abstract = "Learning a disentangled representation of a distribution in a completely unsupervised way is a challenging task that has drawn attention recently. In particular, much focus has been put in separating factors of variation (i.e., attributes) within the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "189", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kumar:2023:AFS, author = "Puneet Kumar and Gaurav Bhatt and Omkar Ingle and Daksh Goyal and Balasubramanian Raman", title = "Affective Feedback Synthesis Towards Multimodal Text and Image Data", journal = j-TOMM, volume = "19", number = "6", pages = "190:1--190:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3589186", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3589186", abstract = "In this article, we have defined a novel task of affective feedback synthesis that generates feedback for input text and corresponding images in a way similar to humans responding to multimodal data. A feedback synthesis system has been proposed and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "190", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:AAS, author = "Yikun Xu and Xingxing Wei and Pengwen Dai and Xiaochun Cao", title = "{A$^2$SC}: Adversarial Attacks on Subspace Clustering", journal = j-TOMM, volume = "19", number = "6", pages = "191:1--191:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587097", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3587097", abstract = "Many studies demonstrate that supervised learning techniques are vulnerable to adversarial examples. 
However, adversarial threats in unsupervised learning have not drawn sufficient scholarly attention. In this article, we formally address the unexplored \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "191", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2023:DTV, author = "Xianhua Zeng and Saiyuan Chen and Yicai Xie and Tianxing Liao", title = "{3V3D}: Three-View Contextual Cross-slice Difference Three-dimensional Medical Image Segmentation Adversarial Network", journal = j-TOMM, volume = "19", number = "6", pages = "192:1--192:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3592614", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3592614", abstract = "In three-dimensional (3D) medical image segmentation, it is still a great challenge to obtain the multidimensional feature information contained in voxel images using a single view for smaller segmentation targets, and the robustness of models obtained by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "192", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Becattini:2023:VLS, author = "Federico Becattini and Pietro Bongini and Luana Bulla and Alberto {Del Bimbo} and Ludovica Marinucci and Misael Mongiov{\`\i} and Valentina Presutti", title = "{VISCOUNTH}: a Large-scale Multilingual Visual Question Answering Dataset for Cultural Heritage", journal = j-TOMM, volume = "19", number = "6", pages = "193:1--193:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3590773", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3590773", abstract = "Visual question answering has recently been settled as a fundamental multi-modal reasoning task of artificial intelligence that allows users to get information about visual content by asking questions in natural language. In the cultural heritage domain, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "193", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hsu:2023:RMS, author = "Wei-Yen Hsu and Pei-Wen Jian", title = "Recurrent Multi-scale Approximation-Guided Network for Single Image Super-Resolution", journal = j-TOMM, volume = "19", number = "6", pages = "194:1--194:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3592613", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3592613", abstract = "Single-image super-resolution (SISR) is an essential topic in computer vision applications. However, most CNN-based SISR approaches directly learn the relationship between low- and high-resolution images while ignoring the contextual texture and detail \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "194", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:HAW, author = "Bo Li and Yong Zhang and Chengyang Zhang and Xinglin Piao and Baocai Yin", title = "Hypergraph Association Weakly Supervised Crowd Counting", journal = j-TOMM, volume = "19", number = "6", pages = "195:1--195:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3594670", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3594670", abstract = "Weakly supervised crowd counting involves the regression of the number of individuals present in an image, using only the total number as the label. 
However, this task is plagued by two primary challenges: the large variation of head size and uneven \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "195", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tai:2023:MAS, author = "Yichun Tai and Hailin Shi and Dan Zeng and Hang Du and Yibo Hu and Zicheng Zhang and Zhijiang Zhang and Tao Mei", title = "Multi-Agent Semi-{Siamese} Training for Long-Tail and Shallow Face Learning", journal = j-TOMM, volume = "19", number = "6", pages = "196:1--196:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3594669", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3594669", abstract = "With the recent development of deep convolutional neural networks and large-scale datasets, deep face recognition has made remarkable progress and been widely used in various applications. However, unlike the existing public face datasets, in many real-world \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "196", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2023:PEE, author = "Rui Li and Baopeng Zhang and Wei Liu and Zhu Teng and Jianping Fan", title = "{PANet}: an End-to-end Network Based on Relative Motion for Online Multi-object Tracking", journal = j-TOMM, volume = "19", number = "6", pages = "197:1--197:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3595379", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3595379", abstract = "The popular tracking-by-detection paradigm of multi-object tracking (MOT) takes detections of each frame as the input and associates detections from one frame to another. Existing association methods based on the relative motion have attracted attention, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "197", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2023:SBD, author = "Ye Yuan and Jiawan Zhang", title = "Shot Boundary Detection Using Color Clustering and Attention Mechanism", journal = j-TOMM, volume = "19", number = "6", pages = "198:1--198:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3595923", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3595923", abstract = "Shot boundary detection (SBD) is widely used in scene segmentation, semantic analysis, and video retrieval. 
However, existing SBD algorithms have certain applications in video processing, but they have the following three problems. First, these algorithms \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "198", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2023:TIS, author = "Cong Huang and Xiulian Peng and Dong Liu and Yan Lu", title = "Text Image Super-Resolution Guided by Text Structure and Embedding Priors", journal = j-TOMM, volume = "19", number = "6", pages = "199:1--199:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3595924", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3595924", abstract = "We aim to super-resolve text images from unrecognizable low-resolution inputs. Existing super-resolution methods mainly learn a direct mapping from low-resolution to high-resolution images by exploring low-level features, which usually generate blurry \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "199", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2023:MLR, author = "Jie Zhu and Bo Peng and Wanqing Li and Haifeng Shen and Qingming Huang and Jianjun Lei", title = "Modeling Long-range Dependencies and Epipolar Geometry for Multi-view Stereo", journal = j-TOMM, volume = "19", number = "6", pages = "200:1--200:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3596445", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3596445", abstract = "This article proposes a network, referred to as Multi-View Stereo TRansformer (MVSTR) for depth estimation from multi-view images. By modeling long-range dependencies and epipolar geometry, the proposed MVSTR is capable of extracting dense features with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "200", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:IFD, author = "Xiumei Chen and Xiangtao Zheng and Xiaoqiang Lu", title = "Identity Feature Disentanglement for Visible-Infrared Person Re-Identification", journal = j-TOMM, volume = "19", number = "6", pages = "201:1--201:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3595183", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3595183", abstract = "Visible-infrared person re-identification (VI-ReID) task aims to retrieve persons from different spectrum cameras (i.e., visible and infrared images). 
The biggest challenge of VI-ReID is the huge cross-modal discrepancy caused by different imaging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "201", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shu:2023:CAP, author = "Zhenyu Shu and Ling Gao and Shun Yi and Fangyu Wu and Xin Ding and Ting Wan and Shiqing Xin", title = "Context-Aware {$3$D} Points of Interest Detection via Spatial Attention Mechanism", journal = j-TOMM, volume = "19", number = "6", pages = "202:1--202:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597026", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3597026", abstract = "Detecting points of interest is a fundamental problem in 3D shape analysis and can be beneficial to various tasks in multimedia processing. Traditional learning-based detection methods usually rely on each vertex's geometric features to discriminate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "202", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2023:CCF, author = "Zhen Chen and Ming Yang and Shiliang Zhang", title = "Complementary Coarse-to-Fine Matching for Video Object Segmentation", journal = j-TOMM, volume = "19", number = "6", pages = "203:1--203:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3596496", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3596496", abstract = "Semi-supervised Video Object Segmentation (VOS) needs to establish pixel-level correspondences between a video frame and preceding segmented frames to leverage their segmentation clues. Most works rely on features at a single scale to establish those \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "203", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Srinivas:2023:CBN, author = "Kankanala Srinivas and Ashish Kumar Bhandari", title = "Context-Based Novel Histogram Bin Stretching Algorithm for Automatic Contrast Enhancement", journal = j-TOMM, volume = "19", number = "6", pages = "204:1--204:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597303", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3597303", abstract = "This article presents CHBS, a novel context-based histogram bin stretching method that enhances the contrast by increasing the range of gray levels and randomness among the gray levels. 
It comprises image spatial contextual information and discrete cosine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "204", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2023:UDA, author = "Zhenjun Tang and Zhiyuan Chen and Zhixin Li and Bineng Zhong and Xianquan Zhang and Xinpeng Zhang", title = "Unifying Dual-Attention and {Siamese} Transformer Network for Full-Reference Image Quality Assessment", journal = j-TOMM, volume = "19", number = "6", pages = "205:1--205:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597434", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3597434", abstract = "Image Quality Assessment (IQA) is a critical task of computer vision. Most Full-Reference (FR) IQA methods have limitation in the accurate prediction of perceptual qualities of the traditional distorted images and the Generative Adversarial Networks (GANs). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "205", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2023:LSR, author = "Geyu Tang and Xingyu Gao and Zhenyu Chen", title = "Learning Semantic Representation on Visual Attribute Graph for Person Re-identification and Beyond", journal = j-TOMM, volume = "19", number = "6", pages = "206:1--206:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3487044", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3487044", abstract = "Person re-identification (re-ID) aims to match pedestrian pairs captured from different cameras. Recently, various attribute-based models have been proposed to combine the pedestrian attribute as an auxiliary semantic information to learn a more \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "206", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Deng:2023:LGL, author = "Zijun Deng and Xiangteng He and Yuxin Peng", title = "{LFR-GAN}: Local Feature Refinement based Generative Adversarial Network for Text-to-Image Generation", journal = j-TOMM, volume = "19", number = "6", pages = "207:1--207:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3589002", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3589002", abstract = "Text-to-image generation aims to generate images from text descriptions. 
Its main challenge lies in two aspects: (1) Semantic consistency, i.e., the generated images should be semantically consistent with the input text; and (2) Visual reality, i.e., the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "207", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Du:2023:WSH, author = "Yongchao Du and Min Wang and Zhenbo Lu and Wengang Zhou and Houqiang Li", title = "Weakly Supervised Hashing with Reconstructive Cross-modal Attention", journal = j-TOMM, volume = "19", number = "6", pages = "208:1--208:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3589185", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3589185", abstract = "On many popular social websites, images are usually associated with some meta-data such as textual tags, which involve semantic information relevant to the image and can be used to supervise the representation learning for image retrieval. However, these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "208", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2023:CSC, author = "Meng Wang and Jizheng Xu and Li Zhang and Junru Li and Kai Zhang and Shiqi Wang and Siwei Ma", title = "Compressed Screen Content Image Super Resolution", journal = j-TOMM, volume = "19", number = "6", pages = "209:1--209:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3589963", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3589963", abstract = "Screen content has become one of the prominent mediums in the increasingly connected world. With the prevalence of remote collaboration and communication such as virtual conferences and online education, recent years have witnessed a dramatic increase in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "209", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2023:CUH, author = "Boqiang Xu and Jian Liang and Lingxiao He and Jinlin Wu and Chao Fan and Zhenan Sun", title = "Color-Unrelated Head-Shoulder Networks for Fine-Grained Person Re-identification", journal = j-TOMM, volume = "19", number = "6", pages = "210:1--210:??", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3599730", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:46 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3599730", abstract = "Person re-identification (re-id) attempts to match pedestrian images with the same identity across non-overlapping cameras. 
Existing methods usually study person re-id by learning discriminative features based on the clothing attributes (e.g., color, \ldots{})", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "210", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2024:IBC, author = "Zhenbo Xu and Hai-Miao Hu and Liu Liu and Dongping Zhang and Shifeng Zhang and Wenming Tan", title = "Instance-Based Continual Learning: a Real-World Dataset and Baseline for Fresh Recognition", journal = j-TOMM, volume = "20", number = "1", pages = "1:1--1:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3591209", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3591209", abstract = "Real-time learning on real-world data streams with temporal relations is essential for intelligent agents. However, current online Continual Learning (CL) benchmarks adopt the mini-batch setting and are composed of temporally unrelated and disjoint tasks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:RHG, author = "Xiaoping Liang and Zhenjun Tang and Zhixin Li and Mengzhu Yu and Hanyun Zhang and Xianquan Zhang", title = "Robust Hashing via Global and Local Invariant Features for Image Copy Detection", journal = j-TOMM, volume = "20", number = "1", pages = "2:1--2:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3600234", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3600234", abstract = "Robust hashing is a powerful technique for processing large-scale images. Currently, many reported image hashing schemes do not perform well in balancing the performances of discrimination and robustness, and thus they cannot efficiently detect image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sarma:2024:DID, author = "Sandipan Sarma and Arijit Sur", title = "{DiRaC-I}: Identifying Diverse and Rare Training Classes for Zero-Shot Learning", journal = j-TOMM, volume = "20", number = "1", pages = "3:1--3:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3603147", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3603147", abstract = "Zero-Shot Learning (ZSL) is an extreme form of transfer learning that aims at learning from a few ``seen classes'' to have an understanding about the ``unseen classes'' in the wild. Given a dataset in ZSL research, most existing works use a predetermined, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2024:SSJ, author = "Chengyu Zheng and Ning Song and Ruoyu Zhang and Lei Huang and Zhiqiang Wei and Jie Nie", title = "Scale-Semantic Joint Decoupling Network for Image-Text Retrieval in Remote Sensing", journal = j-TOMM, volume = "20", number = "1", pages = "4:1--4:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3603628", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3603628", abstract = "Image-text retrieval in remote sensing aims to provide flexible information for data analysis and application. 
In recent years, state-of-the-art methods are dedicated to ``scale decoupling'' and ``semantic decoupling'' strategies to further enhance the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:ZSS, author = "Jiankai Li and Yunhong Wang and Weixin Li", title = "Zero-shot Scene Graph Generation via Triplet Calibration and Reduction", journal = j-TOMM, volume = "20", number = "1", pages = "5:1--5:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3604284", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3604284", abstract = "Scene Graph Generation (SGG) plays a pivotal role in downstream vision-language tasks. Existing SGG methods typically suffer from poor compositional generalizations on unseen triplets. They are generally trained on incompletely annotated scene graphs that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yaqoob:2024:APT, author = "Abid Yaqoob and Gabriel-Miro Muntean", title = "Advanced Predictive Tile Selection Using Dynamic Tiling for Prioritized 360${}^\circ $ Video {VR} Streaming", journal = j-TOMM, volume = "20", number = "1", pages = "6:1--6:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3603146", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3603146", abstract = "The widespread availability of smart computing and display devices such as mobile phones, gaming consoles, laptops, and tethered/untethered head-mounted displays has fueled an increase in demand for omnidirectional (360${}^\circ $) videos. 360${}^\circ $ video applications \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:LGR, author = "Jia Wang and Hong-Han Shuai and Yung-Hui Li and Wen-Huang Cheng", title = "Language-guided Residual Graph Attention Network and Data Augmentation for Visual Grounding", journal = j-TOMM, volume = "20", number = "1", pages = "7:1--7:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3604557", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3604557", abstract = "Visual grounding is an essential task in understanding the semantic relationship between the given text description and the target object in an image. Due to the innate complexity of language and the rich semantic context of the image, it is still a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:ACN, author = "Haoran Wang and Yajie Wang and Baosheng Yu and Yibing Zhan and Chunfeng Yuan and Wankou Yang", title = "Attentional Composition Networks for Long-Tailed Human Action Recognition", journal = j-TOMM, volume = "20", number = "1", pages = "8:1--8:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3603253", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3603253", abstract = "The problem of long-tailed visual recognition has been receiving increasing research attention. 
However, the long-tailed distribution problem remains underexplored for video-based visual recognition. To address this issue, in this article we propose a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:SSC, author = "Zi-Chao Zhang and Zhen-Duo Chen and Zhen-Yu Xie and Xin Luo and Xin-Shun Xu", title = "{S3Mix}: Same Category Same Semantics Mixing for Augmenting Fine-grained Images", journal = j-TOMM, volume = "20", number = "1", pages = "9:1--9:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3605892", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3605892", abstract = "Data augmentation is a common technique to improve the generalization performance of models for image classification. Although methods such as Mixup and CutMix that mix images randomly are indeed instrumental in general image classification, randomly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tan:2024:TBR, author = "Mingkui Tan and Zhiquan Wen and Leyuan Fang and Qi Wu", title = "Transformer-Based Relational Inference Network for Complex Visual Relational Reasoning", journal = j-TOMM, volume = "20", number = "1", pages = "10:1--10:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3605781", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3605781", abstract = "Visual Relational Reasoning is the basis of many vision-and-language based tasks (e.g., visual question answering and referring expression comprehension). In this article, we regard the complex referring expression comprehension (c-REF) task as the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:SSL, author = "Yiming Yang and Weipeng Hu and Haifeng Hu", title = "Syncretic Space Learning Network for {NIR-VIS} Face Recognition", journal = j-TOMM, volume = "20", number = "1", pages = "11:1--11:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3607143", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3607143", abstract = "To overcome the technical bottleneck of face recognition in low-light scenarios, Near-InfraRed and VISible (NIR-VIS) heterogeneous face recognition is proposed for matching well-lit VIS faces with poorly lit NIR faces. Current cross-modal synthesis \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:DWG, author = "Chenghua Li and Zongze Li and Jing Sun and Yun Zhang and Xiaoping Jiang and Fan Zhang", title = "Dynamic Weighted Gradient Reversal Network for Visible-infrared Person Re-identification", journal = j-TOMM, volume = "20", number = "1", pages = "12:1--12:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3607535", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3607535", abstract = "Due to intra-modality variations and cross-modality discrepancy, visible-infrared person re-identification (VI Re-ID) is an important and challenging task in intelligent video surveillance. The cross-modality discrepancy is mainly caused by the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2024:TFI, author = "Jiajun Song and Zhuo Li and Weiqing Min and Shuqiang Jiang", title = "Towards Food Image Retrieval via Generalization-Oriented Sampling and Loss Function Design", journal = j-TOMM, volume = "20", number = "1", pages = "13:1--13:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3600095", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3600095", abstract = "Food computing has increasingly received widespread attention in the multimedia field. 
As a basic task of food computing, food image retrieval has wide applications, that is, food image retrieval can help users to find the desired food from a large number \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jin:2024:CBN, author = "Yiting Jin and Jie Wu and Wanliang Wang and Yidong Yan and Jiawei Jiang and Jianwei Zheng", title = "Cascading Blend Network for Image Inpainting", journal = j-TOMM, volume = "20", number = "1", pages = "14:1--14:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3608952", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3608952", abstract = "Image inpainting refers to filling in unknown regions with known knowledge, which is in full flourish accompanied by the popularity and prosperity of deep convolutional networks. Current inpainting methods have excelled in completing small-sized \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2024:DLS, author = "Kehua Guo and Liang Chen and Xiangyuan Zhu and Xiaoyan Kui and Jian Zhang and Heyuan Shi", title = "Double-Layer Search and Adaptive Pooling Fusion for Reference-Based Image Super-Resolution", journal = j-TOMM, volume = "20", number = "1", pages = "15:1--15:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3604937", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3604937", abstract = "Reference-based image super-resolution (RefSR) aims to reconstruct high-resolution (HR) images from low-resolution (LR) images by introducing HR reference images. The key step of RefSR is to transfer reference features to LR features. However, existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2024:UOF, author = "Jing Zhao and Bin Li and Jiahao Li and Ruiqin Xiong and Yan Lu", title = "A Universal Optimization Framework for Learning-based Image Codec", journal = j-TOMM, volume = "20", number = "1", pages = "16:1--16:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3580499", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580499", abstract = "Recently, machine learning-based image compression has attracted increasing interests and is approaching the state-of-the-art compression ratio. But unlike traditional codec, it lacks a universal optimization method to seek efficient representation for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:ICS, author = "Liping Zhang and Shukai Chen and Fei Lin and Wei Ren and Kim-Kwang Raymond Choo and Geyong Min", title = "{$1$DIEN}: Cross-session Electrocardiogram Authentication Using {$1$D} Integrated {EfficientNet}", journal = j-TOMM, volume = "20", number = "1", pages = "17:1--17:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3609800", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3609800", abstract = "The potential of using electrocardiogram (ECG), an important physiological signal for humans, as a new biometric trait has been demonstrated, and ongoing efforts have focused on utilizing deep learning (e.g., 2D neural networks) to improve authentication \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:DMP, author = "Baian Chen and Zhilei Chen and Xiaowei Hu and Jun Xu and Haoran Xie and Jing Qin and Mingqiang Wei", title = "Dynamic Message Propagation Network for {RGB-D} and Video Salient Object Detection", journal = j-TOMM, volume = "20", number = "1", pages = "18:1--18:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3597612", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3597612", abstract = "Exploiting long-range semantic contexts and geometric information is crucial to infer salient objects from RGB and depth features. However, existing methods mainly focus on excavating local features within fixed regions by continuously feeding forward \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2024:SSM, author = "Xiang Gao and Wei Hu and Guo-Jun Qi", title = "Self-supervised Multi-view Learning via Auto-encoding {$3$D} Transformations", journal = j-TOMM, volume = "20", number = "1", pages = "19:1--19:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3597613", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3597613", abstract = "3D object representation learning is a fundamental challenge in computer vision to infer about the 3D world. 
Recent advances in deep learning have shown their efficiency in 3D object recognition, among which view-based methods have performed best so far. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:EAE, author = "Dewang Wang and Gaobo Yang and Zhiqing Guo and Jiyou Chen", title = "Enhancing Adversarial Embedding based Image Steganography via Clustering Modification Directions", journal = j-TOMM, volume = "20", number = "1", pages = "20:1--20:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3603377", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3603377", abstract = "Image steganography is a technique used to conceal secret information within cover images without being detected. However, the advent of convolutional neural networks (CNNs) has threatened the security of image steganography. Due to the inherent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2024:DHO, author = "Xiaojia Zhao and Tingting Xu and Qiangqiang Shen and Youfa Liu and Yongyong Chen and Jingyong Su", title = "Double High-Order Correlation Preserved Robust Multi-View Ensemble Clustering", journal = j-TOMM, volume = "20", number = "1", pages = "21:1--21:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612923", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612923", abstract = "Ensemble clustering (EC), utilizing multiple basic partitions (BPs) to yield a robust consensus clustering, has shown promising clustering performance. Nevertheless, most current algorithms suffer from two challenging hurdles: (1) a surge of EC-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tasaka:2024:UQM, author = "Shuji Tasaka", title = "Usefulness of {QoS} in Multidimensional {QoE} Prediction for Haptic-Audiovisual Communications", journal = j-TOMM, volume = "20", number = "1", pages = "22:1--22:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3613246", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3613246", abstract = "This article investigates prediction of Quality of Experience (QoE) by comparing borrowing-from-neighbor situations and isolated ones. 
We demonstrate that joint utilization of multiple QoE measures enhances the accuracy of QoE prediction compared to that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:EIC, author = "Ching-Nung Yang and Xiaotian Wu and Min-Jung Chung", title = "Enhancement of Information Carrying and Decoding for Visual Cryptography with Error Correction", journal = j-TOMM, volume = "20", number = "1", pages = "23:1--23:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612927", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612927", abstract = "Recently, three visual cryptography schemes with $t$-error-correcting capability (VCSs-$t$EC) were introduced for preventing the shadows carrying additional information from being corrupted by noise interference. However, the concerns on VCS-$t$EC, such as the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:SSV, author = "Yuqing Zhang and Yong Zhang and Shaofan Wang and Yun Liang and Baocai Yin", title = "Semi-supervised Video Object Segmentation Via an Edge Attention Gated Graph Convolutional Network", journal = j-TOMM, volume = "20", number = "1", pages = "24:1--24:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3611389", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3611389", abstract = "Video object segmentation (VOS) exhibits heavy occlusions, large deformation, and severe motion blur. While many remarkable convolutional neural networks are devoted to the VOS task, they often mis-identify background noise as the target or output coarse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wen:2024:VSI, author = "Wenying Wen and Minghui Huang and Yushu Zhang and Yuming Fang and Yifan Zuo", title = "Visual Security Index Combining {CNN} and Filter for Perceptually Encrypted Light Field Images", journal = j-TOMM, volume = "20", number = "1", pages = "25:1--25:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612924", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612924", abstract = "Visual security index (VSI) represents a quantitative index for the visual security evaluation of perceptually encrypted images. Recently, the research on visual security of encrypted light field (LF) images faces two challenges. One is that the existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:CCS, author = "Linlin Liu and Haijun Zhang and Qun Li and Jianghong Ma and Zhao Zhang", title = "Collocated Clothing Synthesis with {GANs} Aided by Textual Information: a Multi-Modal Framework", journal = j-TOMM, volume = "20", number = "1", pages = "26:1--26:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3614097", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3614097", abstract = "Synthesizing realistic images of fashion items which are compatible with given clothing images, as well as conditioning on multiple modalities, brings novel and exciting applications together with enormous economic potential. In this work, we propose a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lou:2024:SSC, author = "Xulei Lou and Tinghui Wu and Haifeng Hu and Dihu Chen", title = "Self-Supervised Consistency Based on Joint Learning for Unsupervised Person Re-identification", journal = j-TOMM, volume = "20", number = "1", pages = "27:1--27:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612926", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612926", abstract = "Recently, unsupervised domain adaptive person re-identification (Re-ID) methods have been extensively studied thanks to not requiring annotations, and they have achieved excellent performance. Most of the existing methods aim to train the Re-ID model for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:CAP, author = "Yichi Zhang and Gongchun Ding and Dandan Ding and Zhan Ma and Zhu Li", title = "On Content-Aware Post-Processing: Adapting Statistically Learned Models to Dynamic Content", journal = j-TOMM, volume = "20", number = "1", pages = "28:1--28:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612925", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612925", abstract = "Learning-based post-processing methods generally produce neural models that are statistically optimal on their training datasets. 
These models, however, neglect intrinsic variations of local video content and may fail to process unseen content. To address \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2024:DIC, author = "Jing Xu and Bing Liu and Yong Zhou and Mingming Liu and Rui Yao and Zhiwen Shao", title = "Diverse Image Captioning via Conditional Variational Autoencoder and Dual Contrastive Learning", journal = j-TOMM, volume = "20", number = "1", pages = "29:1--29:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3614435", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3614435", abstract = "Diverse image captioning has achieved substantial progress in recent years. However, the discriminability of generative models and the limitation of cross entropy loss are generally overlooked in the traditional diverse image captioning models, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zou:2024:CLN, author = "Cong Zou and Rui Wang and Cheng Jin and Sanyi Zhang and Xin Wang", title = "{S$^2$CL-LeafNet}: Recognizing Leaf Images Like Human Botanists", journal = j-TOMM, volume = "20", number = "1", pages = "30:1--30:??", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3615659", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Sep 29 07:50:48 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3615659", abstract = "Automatically classifying plant leaves is a challenging fine-grained classification task because of the diversity in leaf morphology, including size, texture, shape, and venation. Although powerful deep learning-based methods have achieved great \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Namasudra:2024:ISI, author = "Suyel Namasudra and Pascal Lorenz and Seifedine Kadry and Syed Ahmad Chan Bukhari", title = "Introduction to the Special Issue on {DNA}-centric Modeling and Practice for Next-generation Computing and Communication Systems", journal = j-TOMM, volume = "20", number = "2", pages = "31:1--31:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3578364", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3578364", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "31", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wan:2024:ESI, author = "Shaohua Wan and Yi Jin and Guangdong Xu and Michele Nappi", title = "Editorial to Special Issue on Multimedia Cognitive Computing for Intelligent Transportation System", journal = j-TOMM, volume = "20", number = "2", pages = "32:1--32:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3604938", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3604938", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "32", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2024:TEL, author = "Ruonan Zhao and Laurence T. Yang and Debin Liu and Wanli Lu and Chenlu Zhu and Yiheng Ruan", title = "Tensor-Empowered {LSTM} for Communication-Efficient and Privacy-Enhanced Cognitive Federated Learning in Intelligent Transportation Systems", journal = j-TOMM, volume = "20", number = "2", pages = "33:1--33:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3575661", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3575661", abstract = "Multimedia cognitive computing as a revolutionary emerging concept of artificial intelligence emulating the reasoning process like human brains can facilitate the evolution of intelligent transportation systems (ITS) to be smarter, safer, and more \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "33", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2024:RSB, author = "Hongjian Shi and Hao Wang and Ruhui Ma and Yang Hua and Tao Song and Honghao Gao and Haibing Guan", title = "Robust Searching-Based Gradient Collaborative Management in Intelligent Transportation System", journal = j-TOMM, volume = "20", number = "2", pages = "34:1--34:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3549939", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3549939", abstract = "With the rapid development of big data and the Internet of Things (IoT), traffic data from an Intelligent Transportation System (ITS) is becoming more and more accessible. To understand and simulate the traffic patterns from the traffic data, Multimedia \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "34", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Weng:2024:HHC, author = "Zejia Weng and Zuxuan Wu and Hengduo Li and Jingjing Chen and Yu-Gang Jiang", title = "{HCMS}: Hierarchical and Conditional Modality Selection for Efficient Video Recognition", journal = j-TOMM, volume = "20", number = "2", pages = "35:1--35:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3572776", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3572776", abstract = "Videos are multimodal in nature. 
Conventional video recognition pipelines typically fuse multimodal features for improved performance. However, this is not only computationally expensive but also neglects the fact that different videos rely on different \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "35", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:DAS, author = "Shixiong Zhang and Wenmin Wang and Honglei Li and Shenyong Zhang", title = "E-detector: Asynchronous Spatio-temporal for Event-based Object Detection in Intelligent Transportation System", journal = j-TOMM, volume = "20", number = "2", pages = "36:1--36:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3584361", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3584361", abstract = "In intelligent transportation systems, various sensors, including radar and conventional frame cameras, are used to improve system robustness in various challenging scenarios. An event camera is a novel bio-inspired sensor that has attracted the interest \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "36", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Padhy:2024:MVA, author = "Ram Prasad Padhy and Pankaj Kumar Sa and Fabio Narducci and Carmen Bisogni and Sambit Bakshi", title = "Monocular Vision-aided Depth Measurement from {RGB} Images for Autonomous {UAV} Navigation", journal = j-TOMM, volume = "20", number = "2", pages = "37:1--37:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3550485", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3550485", abstract = "Monocular vision-based 3D scene understanding has been an integral part of many machine vision applications. Always, the objective is to measure the depth using a single RGB camera, which is at par with the depth cameras. In this regard, monocular vision-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "37", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2024:SID, author = "Zhihan Lv and Fabio Poiesi and Qi Dong and Jaime Lloret and Houbing Song", title = "Special Issue on Deep Learning for Intelligent Human Computer Interaction", journal = j-TOMM, volume = "20", number = "2", pages = "38:1--38:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3605151", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3605151", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "38", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gong:2024:MMM, author = "Wenjuan Gong and Yue Zhang and Wei Wang and Peng Cheng and Jordi Gonz{\`a}lez", title = "{Meta-MMFNet}: Meta-learning-based Multi-model Fusion Network for Micro-expression Recognition", journal = j-TOMM, volume = "20", number = "2", pages = "39:1--39:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3539576", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3539576", abstract = "Despite its wide applications in criminal investigations and clinical communications with patients suffering from autism, automatic micro-expression recognition remains a challenging problem because of the lack of training data and imbalanced classes \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "39", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Djenouri:2024:EAG, author = "Youcef Djenouri and Asma Belhadi and Gautam Srivastava and Jerry Chun-Wei Lin", title = "An Efficient and Accurate {GPU}-based Deep Learning Model for Multimedia Recommendation", journal = j-TOMM, volume = "20", number = "2", pages = "40:1--40:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3524022", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3524022", abstract = "This article proposes the use of deep learning in human-computer interaction and presents a new explainable hybrid framework for recommending relevant hashtags on a set of orpheline tweets, which are tweets without hashtags. The approach starts by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "40", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Loveleen:2024:EDH, author = "Gaur Loveleen and Bhandari Mohan and Bhadwal Singh Shikhar and Jhanjhi Nz and Mohammad Shorfuzzaman and Mehedi Masud", title = "Explanation-Driven {HCI} Model to Examine the Mini-Mental State for {Alzheimer}'s Disease", journal = j-TOMM, volume = "20", number = "2", pages = "41:1--41:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3527174", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3527174", abstract = "Directing research on Alzheimer's disease toward only early prediction and accuracy cannot be considered a feasible approach toward tackling a ubiquitous degenerative disease today. Applying deep learning (DL), Explainable artificial intelligence, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:AAD, author = "Mi Li and Wei Zhang and Bin Hu and Jiaming Kang and Yuqi Wang and Shengfu Lu", title = "Automatic Assessment of Depression and Anxiety through Encoding Pupil-wave from {HCI} in {VR} Scenes", journal = j-TOMM, volume = "20", number = "2", pages = "42:1--42:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3513263", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3513263", abstract = "At present, there have been many studies on the methods of using the deep learning regression model to assess depression level based on behavioral signals (facial expression, speech, and language); however, the research on the assessment method of anxiety \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qayyum:2024:SFB, author = "Abdul Qayyum and Imran Razzak and M. 
Tanveer and Moona Mazher", title = "Spontaneous Facial Behavior Analysis Using Deep Transformer-based Framework for Child-computer Interaction", journal = j-TOMM, volume = "20", number = "2", pages = "43:1--43:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3539577", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3539577", abstract = "A fascinating challenge in robotics-human interaction is imitating the emotion recognition capability of humans to robots with the aim to make human-robotics interaction natural, genuine and intuitive. To achieve the natural interaction in affective \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:FBH, author = "Xiaowei Chen and Xiao Jiang and Lishuang Zhan and Shihui Guo and Qunsheng Ruan and Guoliang Luo and Minghong Liao and Yipeng Qin", title = "Full-body Human Motion Reconstruction with Sparse Joint Tracking Using Flexible Sensors", journal = j-TOMM, volume = "20", number = "2", pages = "44:1--44:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3564700", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3564700", abstract = "Human motion tracking is a fundamental building block for various applications including computer animation, human-computer interaction, healthcare, and so on. 
To reduce the burden of wearing multiple sensors, human motion prediction from sparse sensor \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qiao:2024:SSL, author = "Shanbao Qiao and Neal N. Xiong and Yongbin Gao and Zhijun Fang and Wenjun Yu and Juan Zhang and Xiaoyan Jiang", title = "Self-Supervised Learning of Depth and Ego-Motion for {$3$D} Perception in Human Computer Interaction", journal = j-TOMM, volume = "20", number = "2", pages = "45:1--45:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3588571", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3588571", abstract = "3D perception of depth and ego-motion is of vital importance in intelligent agent and Human Computer Interaction (HCI) tasks, such as robotics and autonomous driving. There are different kinds of sensors that can directly obtain 3D depth information. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kang:2024:DGN, author = "Yan Kang and Bin Pu and Yongqi Kou and Yun Yang and Jianguo Chen and Khan Muhammad and Po Yang and Lida Xu and Mohammad Hijji", title = "A Deep Graph Network with Multiple Similarity for User Clustering in Human-Computer Interaction", journal = j-TOMM, volume = "20", number = "2", pages = "46:1--46:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3549954", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3549954", abstract = "User counterparts, such as user attributes in social networks or user interests, are the keys to more natural Human-Computer Interaction (HCI). In addition, users' attributes and social structures help us understand the complex interactions in HCI. Most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mahmud:2024:SHA, author = "Bahar Mahmud and Guan Hong and Bernard Fong", title = "A Study of Human--{AI} Symbiosis for Creative Work: Recent Developments and Future Directions in Deep Learning", journal = j-TOMM, volume = "20", number = "2", pages = "47:1--47:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3542698", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3542698", abstract = "Recent advances in Artificial Intelligence (AI), particularly deep learning, are having an enormous impact on our society today. Record numbers of jobs previously held by people have been automated, from manufacturing to transportation to customer \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gu:2024:PPR, author = "Xiaoling Gu and Jie Huang and Yongkang Wong and Jun Yu and Jianping Fan and Pai Peng and Mohan S. 
Kankanhalli", title = "{PAINT}: Photo-realistic Fashion Design Synthesis", journal = j-TOMM, volume = "20", number = "2", pages = "48:1--48:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3545610", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3545610", abstract = "In this article, we investigate a new problem of generating a variety of multi-view fashion designs conditioned on a human pose and texture examples of arbitrary sizes, which can replace the repetitive and low-level design work for fashion designers. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dai:2024:UDA, author = "Qingfeng Dai and Yongkang Wong and Guofei Sun and Yanwei Wang and Zhou Zhou and Mohan S. Kankanhalli and Xiangdong Li and Weidong Geng", title = "Unsupervised Domain Adaptation by Causal Learning for Biometric Signal-based {HCI}", journal = j-TOMM, volume = "20", number = "2", pages = "49:1--49:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3583885", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3583885", abstract = "Biometric signal based human-computer interface (HCI) has attracted increasing attention due to its wide application in healthcare, entertainment, neurocomputing, and so on. In recent years, deep learning-based approaches have made great progress on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2024:RRD, author = "Yi Xiao and Tong Liu and Yu Han and Yue Liu and Yongtian Wang", title = "Realtime Recognition of Dynamic Hand Gestures in Practical Applications", journal = j-TOMM, volume = "20", number = "2", pages = "50:1--50:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3561822", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3561822", abstract = "Dynamic hand gesture acting as a semaphoric gesture is a practical and intuitive mid-air gesture interface. Nowadays benefiting from the development of deep convolutional networks, the gesture recognition has already achieved a high accuracy, however, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gou:2024:HMA, author = "Jianping Gou and Liyuan Sun and Baosheng Yu and Shaohua Wan and Dacheng Tao", title = "Hierarchical Multi-Attention Transfer for Knowledge Distillation", journal = j-TOMM, volume = "20", number = "2", pages = "51:1--51:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3568679", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3568679", abstract = "Knowledge distillation (KD) is a powerful and widely applicable technique for the compression of deep learning models. 
The main idea of knowledge distillation is to transfer knowledge from a large teacher model to a small student model, where the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Deb:2024:AIC, author = "Subhrajyoti Deb and Abhilash Das and Nirmalya Kar", title = "An Applied Image Cryptosystem on {Moore}'s Automaton Operating on {$ \delta (q_k) / F_2 $}", journal = j-TOMM, volume = "20", number = "2", pages = "52:1--52:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3614433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3614433", abstract = "The volume of multimedia-based image data or video frames in Web 3.0 is constantly increasing, owing to the advancement of real-time data transmission. However, security vulnerabilities frequently impair the performance of real-time applications. Many \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{You:2024:IAV, author = "Sisi You and Yukun Zuo and Hantao Yao and Changsheng Xu", title = "Incremental Audio-Visual Fusion for Person Recognition in Earthquake Scene", journal = j-TOMM, volume = "20", number = "2", pages = "53:1--53:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3614434", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3614434", abstract = "Earthquakes have a profound impact on social harmony and property, resulting in damage to buildings and infrastructure. Effective earthquake rescue efforts require rapid and accurate determination of whether any survivors are trapped in the rubble of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2024:BSG, author = "Shiqi Sun and Danlan Huang and Xiaoming Tao and Chengkang Pan and Guangyi Liu and Changwen Chen", title = "Boosting Scene Graph Generation with Contextual Information", journal = j-TOMM, volume = "20", number = "2", pages = "54:1--54:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3615868", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3615868", abstract = "Scene graph generation (SGG) has been developed to detect objects and their relationships from the visual data and has attracted increasing attention in recent years. 
Existing works have focused on extracting object context for SGG. However, very few \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2024:CAG, author = "Jianwei Zheng and Yu Liu and Yuchao Feng and Honghui Xu and Meiyu Zhang", title = "Contrastive Attention-guided Multi-level Feature Registration for Reference-based Super-resolution", journal = j-TOMM, volume = "20", number = "2", pages = "55:1--55:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3616495", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3616495", abstract = "Given low-quality input and assisted by referential images, reference-based super-resolution (RefSR) strives to enlarge the spatial size with the guarantee of realistic textures, for which sophisticated feature-matching strategies are naturally demanded. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:AAL, author = "Shangxi Wu and Jitao Sang and Kaiyan Xu and Guanhua Zheng and Changsheng Xu", title = "Adaptive Adversarial Logits Pairing", journal = j-TOMM, volume = "20", number = "2", pages = "56:1--56:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3616375", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3616375", abstract = "Adversarial examples provide an opportunity as well as impose a challenge for understanding image classification systems. Based on the analysis of the adversarial training solution-Adversarial Logits Pairing (ALP), we observed in this work that: (1) The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:BBA, author = "Ying Chen and Rui Yao and Yong Zhou and Jiaqi Zhao and Bing Liu and Abdulmotaleb {El Saddik}", title = "Black-box Attack against Self-supervised Video Object Segmentation Models with Contrastive Loss", journal = j-TOMM, volume = "20", number = "2", pages = "57:1--57:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617502", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617502", abstract = "Deep learning models have been proven to be susceptible to malicious adversarial attacks, which manipulate input images to deceive the model into making erroneous decisions. Consequently, the threat posed to these models serves as a poignant reminder of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:RFO, author = "Shuang Liang and Wentao Ma and Chi Xie", title = "Relation with Free Objects for Action Recognition", journal = j-TOMM, volume = "20", number = "2", pages = "58:1--58:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617596", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617596", abstract = "Relevant objects are widely used for aiding human action recognition in still images. 
Such objects are founded by a dedicated and pre-trained object detector in all previous methods. Such methods have two drawbacks. First, training an object detector \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2024:FMW, author = "Qiaolin He and Zhijie Zheng and Haifeng Hu", title = "A Feature Map is Worth a Video Frame: Rethinking Convolutional Features for Visible-Infrared Person Re-identification", journal = j-TOMM, volume = "20", number = "2", pages = "59:1--59:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617375", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617375", abstract = "Visible-Infrared Person Re-identification (VI-ReID) aims to search for the identity of the same person across different spectra. The feature maps obtained from the convolutional layers are generally used for loss calculation in the later stages of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2024:GCL, author = "Wuliang Huang and Yiqiang Chen and Xinlong Jiang and Teng Zhang and Qian Chen", title = "{GJFusion}: a Channel-Level Correlation Construction Method for Multimodal Physiological Signal Fusion", journal = j-TOMM, volume = "20", number = "2", pages = "60:1--60:??", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617503", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Fri Nov 3 14:55:26 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617503", abstract = "Physiological signal based ubiquitous computing has garnered significant attention. However, the heterogeneity among multimodal physiological signals poses a critical challenge to practical applications. To traverse this heterogeneity gap, recent studies \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shen:2024:SAC, author = "Chengji Shen and Zhenjiang Liu and Xin Gao and Zunlei Feng and Mingli Song", title = "Self-Adaptive Clothing Mapping Based Virtual Try-on", journal = j-TOMM, volume = "20", number = "3", pages = "61:1--61:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3613453", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3613453", abstract = "VTON (Virtual Try-ON), as an innovative visual application in e-commerce scenarios with great commercial value, has been widely studied in recent years. Due to its better robustness and realistic effect, deformation-synthesize-based VTON has become the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Baldrati:2024:CIR, author = "Alberto Baldrati and Marco Bertini and Tiberio Uricchio and Alberto {Del Bimbo}", title = "Composed Image Retrieval using Contrastive Learning and Task-oriented {CLIP}-based Features", journal = j-TOMM, volume = "20", number = "3", pages = "62:1--62:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617597", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617597", abstract = "Given a query composed of a reference image and a relative caption, the Composed Image Retrieval goal is to retrieve images visually similar to the reference one that integrates the modifications expressed by the caption. Given that recent research has \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:CMM, author = "Yan Wang and Peize Li and Qingyi Si and Hanwen Zhang and Wenyu Zang and Zheng Lin and Peng Fu", title = "Cross-modality Multiple Relations Learning for Knowledge-based Visual Question Answering", journal = j-TOMM, volume = "20", number = "3", pages = "63:1--63:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3618301", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3618301", abstract = "Knowledge-based visual question answering not only needs to answer the questions based on images but also incorporates external knowledge to study reasoning in the joint space of vision and language. To bridge the gap between visual content and semantic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2024:IDB, author = "Qiang Guo and Zhi Zhang and Mingliang Zhou and Hong Yue and Huayan Pu and Jun Luo", title = "Image Defogging Based on Regional Gradient Constrained Prior", journal = j-TOMM, volume = "20", number = "3", pages = "64:1--64:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617834", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617834", abstract = "Foggy days limit the functionality of outdoor surveillance systems. 
However, it is still a challenge for existing methods to maintain the uniformity of defogging between image regions with a similar depth of field and large differences in appearance. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2024:PDP, author = "Jintao Guo and Lei Qi and Yinghuan Shi and Yang Gao", title = "{PLACE Dropout}: a Progressive Layer-wise and Channel-wise Dropout for Domain Generalization", journal = j-TOMM, volume = "20", number = "3", pages = "65:1--65:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624015", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624015", abstract = "Domain generalization (DG) aims to learn a generic model from multiple observed source domains that generalizes well to arbitrary unseen target domains without further training. The major challenge in DG is that the model inevitably faces a severe \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiong:2024:VLS, author = "Yuan Xiong and Jingru Wang and Zhong Zhou", title = "{VirtualLoc}: Large-scale Visual Localization Using Virtual Images", journal = j-TOMM, volume = "20", number = "3", pages = "66:1--66:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3622788", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3622788", abstract = "Robust and accurate camera pose estimation is fundamental in computer vision. Learning-based regression approaches acquire six-degree-of-freedom camera parameters accurately from visual cues of an input image. However, most are trained on street-view and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:ECD, author = "Yiheng Zhang and Ting Yao and Zhaofan Qiu and Tao Mei", title = "Explaining Cross-domain Recognition with Interpretable Deep Classifier", journal = j-TOMM, volume = "20", number = "3", pages = "67:1--67:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3623399", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3623399", abstract = "The recent advances in deep learning predominantly construct models in their internal representations, and it is opaque to explain the rationale behind and decisions to human users. 
Such explainability is especially essential for domain adaptation, whose \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:AGM, author = "Ruimin Wang and Fasheng Wang and Yiming Su and Jing Sun and Fuming Sun and Haojie Li", title = "Attention-guided Multi-modality Interaction Network for {RGB-D} Salient Object Detection", journal = j-TOMM, volume = "20", number = "3", pages = "68:1--68:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624747", abstract = "The past decade has witnessed great progress in RGB-D salient object detection (SOD). However, there are two bottlenecks that limit its further development. The first one is low-quality depth maps. Most existing methods directly use raw depth maps to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rime:2024:HWY, author = "Jemily Rime and Alan Archer-Boyd and Tom Collins", title = "How Will You Pod? 
{Implications} of Creators' Perspectives for Designing Innovative Podcasting Tools", journal = j-TOMM, volume = "20", number = "3", pages = "69:1--69:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625099", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625099", abstract = "While centred on the medium of audio, podcasts are often a multimedia concern, and one that has become hugely popular in recent years, though relatively little is known about the perspectives of podcast creators and their visions of innovation. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cheung:2024:LPF, author = "Ming Cheung", title = "Learning from the Past: Fast {NAS} for Tasks and Datasets", journal = j-TOMM, volume = "20", number = "3", pages = "70:1--70:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3618000", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3618000", abstract = "Nowadays, with the advancement of technology, many retail companies require in-house data scientist teams to build machine learning tasks, such as user segmentation and item price prediction. These teams typically use a trial-and-error process to obtain a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:UIQ, author = "Xinyue Li and Haiyong Xu and Gangyi Jiang and Mei Yu and Ting Luo and Xuebo Zhang and Hongwei Ying", title = "Underwater Image Quality Assessment from Synthetic to Real-world: Dataset and Objective Method", journal = j-TOMM, volume = "20", number = "3", pages = "71:1--71:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624983", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624983", abstract = "The complicated underwater environment and lighting conditions lead to severe influence on the quality of underwater imaging, which tends to impair underwater exploration and research. To effectively evaluate the quality of underwater images, an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hou:2024:DLL, author = "Sujuan Hou and Jiacheng Li and Weiqing Min and Qiang Hou and Yanna Zhao and Yuanjie Zheng and Shuqiang Jiang", title = "Deep Learning for Logo Detection: a Survey", journal = j-TOMM, volume = "20", number = "3", pages = "72:1--72:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3611309", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3611309", abstract = "Logo detection has gradually become a research hotspot in the field of computer vision and multimedia for its various applications, such as social media monitoring, intelligent transportation, and video advertising recommendation. Recent advances in this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2024:DLB, author = "Yunjie Peng and Jinlin Wu and Boqiang Xu and Chunshui Cao and Xu Liu and Zhenan Sun and Zhiqiang He", title = "Deep Learning Based Occluded Person Re-Identification: a Survey", journal = j-TOMM, volume = "20", number = "3", pages = "73:1--73:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3610534", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3610534", abstract = "Occluded person re-identification (Re-ID) focuses on addressing the occlusion problem when retrieving the person of interest across non-overlapping cameras. With the increasing demand for intelligent video surveillance and the application of person Re-ID \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Manzoor:2024:MRL, author = "Muhammad Arslan Manzoor and Sarah Albarri and Ziting Xian and Zaiqiao Meng and Preslav Nakov and Shangsong Liang", title = "Multimodality Representation Learning: a Survey on Evolution, Pretraining and Its Applications", journal = j-TOMM, volume = "20", number = "3", pages = "74:1--74:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617833", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617833", abstract = "Multimodality Representation Learning, as a technique of learning to embed information from different modalities and their correlations, has achieved remarkable success on a variety of applications, such as Visual Question Answering (VQA), Natural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2024:BFS, author = "Yanyan Shi and Shaowu Yang and Wenjing Yang and Dianxi Shi and Xuehui Li", title = "Boosting Few-shot Object Detection with Discriminative Representation and Class Margin", journal = j-TOMM, volume = "20", number = "3", pages = "75:1--75:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3608478", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3608478", abstract = "Classifying and accurately locating a visual category with few annotated training samples in computer vision has motivated the few-shot object detection technique, which exploits transferring the source-domain detection model to the target domain. Under \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cheng:2024:VFH, author = "Harry Cheng and Yangyang Guo and Tianyi Wang and Qi Li and Xiaojun Chang and Liqiang Nie", title = "Voice-Face Homogeneity Tells Deepfake", journal = j-TOMM, volume = "20", number = "3", pages = "76:1--76:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625231", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625231", abstract = "Detecting forgery videos is highly desirable due to the abuse of deepfake. 
Existing detection approaches contribute to exploring the specific artifacts in deepfake videos and fit well on certain data. However, the growing technique on these artifacts \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ye:2024:VSA, author = "Jin Ye and Meng Dan and Wenchao Jiang", title = "A Visual Sensitivity Aware {ABR} Algorithm for {DASH} via Deep Reinforcement Learning", journal = j-TOMM, volume = "20", number = "3", pages = "77:1--77:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3591108", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3591108", abstract = "In order to cope with the fluctuation of network bandwidth and provide smooth video services, adaptive video streaming technology is proposed. In particular, the adaptive bitrate (ABR) algorithm is widely used in dynamic adaptive streaming over HTTP (DASH). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:TRH, author = "Jian Wang and Xiao Wang and Guosheng Zhao", title = "Task Recommendation via Heterogeneous Multi-modal Features and Decision Fusion in Mobile Crowdsensing", journal = j-TOMM, volume = "20", number = "3", pages = "78:1--78:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626239", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3626239", abstract = "In the decision-making process of the behavior of mobile crowdsensing, using a single view to learn a user's preference will lead to a mismatch between the user's wishes and the final task recommendation list, resulting in the low efficiency of the model \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lei:2024:BDV, author = "Si-Chao Lei and Yue-Jiao Gong and Xiao-Lin Xiao and Yi-cong Zhou and Jun Zhang", title = "Boosting Diversity in Visual Search with {Pareto} Non-Dominated Re-Ranking", journal = j-TOMM, volume = "20", number = "3", pages = "79:1--79:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625296", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625296", abstract = "The field of visual search has gained significant attention recently, particularly in the context of web search engines and e-commerce product search platforms. However, the abundance of web images presents a challenge for modern image retrieval systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:ISS, author = "Huijie Zhang and Pu Li and Xiaobai Liu and Xianfeng Yang and Li An", title = "An Iterative Semi-supervised Approach with Pixel-wise Contrastive Loss for Road Extraction in Aerial Images", journal = j-TOMM, volume = "20", number = "3", pages = "80:1--80:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3606374", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3606374", abstract = "Extracting roads in aerial images has numerous applications in artificial intelligence and multimedia computing, including traffic pattern analysis and parking space planning. Learning deep neural networks, though very successful, demand vast amounts of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fang:2024:IAS, author = "Jing Fang and Yinbo Yu and Zhongyuan Wang and Xin Ding and Ruimin Hu", title = "An Image Arbitrary-Scale Super-Resolution Network Using Frequency-domain Information", journal = j-TOMM, volume = "20", number = "3", pages = "81:1--81:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3616376", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3616376", abstract = "Image super-resolution (SR) is a technique to recover lost high-frequency information in low-resolution (LR) images. 
Since spatial-domain information has been widely exploited, there is a new trend to involve frequency-domain information in SR tasks. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Luo:2024:TES, author = "Xiao Luo and Wei Ju and Yiyang Gu and Yifang Qin and Siyu Yi and Daqing Wu and Luchen Liu and Ming Zhang", title = "Toward Effective Semi-supervised Node Classification with Hybrid Curriculum Pseudo-labeling", journal = j-TOMM, volume = "20", number = "3", pages = "82:1--82:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626528", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3626528", abstract = "Semi-supervised node classification is a crucial challenge in relational data mining and has attracted increasing interest in research on graph neural networks (GNNs). However, previous approaches merely utilize labeled nodes to supervise the overall \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2024:FDN, author = "Wen Guo and Wuzhou Quan and Junyu Gao and Tianzhu Zhang and Changsheng Xu", title = "Feature Disentanglement Network: Multi-Object Tracking Needs More Differentiated Features", journal = j-TOMM, volume = "20", number = "3", pages = "83:1--83:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626825", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3626825", abstract = "To reduce computational redundancies, a common approach is to integrate detection and re-identification (Re-ID) into a single network in multi-object tracking (MOT), referred to as ``tracking by detection.'' Most of the previous research has focused on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Khaleel:2024:VVC, author = "Mohammed Khaleel and Azeez Idris and Wallapak Tavanapong and Jacob R. Pratt and Junghwan Oh and Piet C. 
de Groen", title = "{VisActive}: Visual-concept-based Active Learning for Image Classification under Class Imbalance", journal = j-TOMM, volume = "20", number = "3", pages = "84:1--84:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617999", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617999", abstract = "Active learning methods recommend the most informative images from a large unlabeled dataset for manual labeling. These methods improve the performance of an image classifier while minimizing manual labeling efforts. We propose VisActive, a visual-concept-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:GLB, author = "Honghua Chen and Zhiqi Li and Mingqing Wei and Jun Wang", title = "Geometric and Learning-Based Mesh Denoising: a Comprehensive Survey", journal = j-TOMM, volume = "20", number = "3", pages = "85:1--85:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625098", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625098", abstract = "Mesh denoising is a fundamental problem in digital geometry processing. It seeks to remove surface noise while preserving surface intrinsic signals as accurately as possible. While traditional wisdom has been built upon specialized priors to smooth \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2024:BNL, author = "Ning Han and Yawen Zeng and Chuhao Shi and Guangyi Xiao and Hao Chen and Jingjing Chen", title = "{BiC-Net}: Learning Efficient Spatio-temporal Relation for Text-Video Retrieval", journal = j-TOMM, volume = "20", number = "3", pages = "86:1--86:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3627103", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3627103", abstract = "The task of text-video retrieval aims to understand the correspondence between language and vision and has gained increasing attention in recent years. Recent works have demonstrated the superiority of local spatio-temporal relation learning with graph-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2024:ADD, author = "Yuan Feng and Yaojun Hu and Pengfei Fang and Sheng Liu and Yanhong Yang and Shengyong Chen", title = "Asymmetric Dual-Decoder {U-Net} for Joint Rain and Haze Removal", journal = j-TOMM, volume = "20", number = "3", pages = "87:1--87:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3628451", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3628451", abstract = "This work studies the multi-weather restoration problem. 
In real-life scenarios, rain and haze, two often co-occurring common weather phenomena, can greatly degrade the clarity and quality of the scene images, leading to a performance drop in the visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2024:SGD, author = "Yurui Xie and Ling Guan", title = "Sparsity-guided Discriminative Feature Encoding for Robust Keypoint Detection", journal = j-TOMM, volume = "20", number = "3", pages = "88:1--88:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3628432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3628432", abstract = "Existing handcrafted keypoint detectors typically focus on designing specific local structures manually while ignoring whether they have enough flexibility to explore diverse visual patterns in an image. Despite the advancement of learning-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Beuve:2024:HLD, author = "Nicolas Beuve and Wassim Hamidouche and Olivier D{\'e}forges", title = "Hierarchical Learning and Dummy Triplet Loss for Efficient Deepfake Detection", journal = j-TOMM, volume = "20", number = "3", pages = "89:1--89:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626101", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3626101", abstract = "The advancement of generative models has made it easier to create highly realistic Deepfake videos. This accessibility has led to a surge in research on Deepfake detection to mitigate potential misuse. Typically, Deepfake detection models utilize binary \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2024:RPR, author = "Suncheng Xiang and Dahong Qian and Jingsheng Gao and Zirui Zhang and Ting Liu and Yuzhuo Fu", title = "Rethinking Person Re-Identification via Semantic-based Pretraining", journal = j-TOMM, volume = "20", number = "3", pages = "90:1--90:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3628452", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Dec 21 10:47:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3628452", abstract = "Pretraining is a dominant paradigm in computer vision. 
Generally, supervised ImageNet pretraining is commonly used to initialize the backbones of person re-identification (Re-ID) models. However, recent works show a surprising result that CNN-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2024:HSE, author = "Min Peng and Xiaohu Shao and Yu Shi and Xiangdong Zhou", title = "Hierarchical Synergy-Enhanced Multimodal Relational Network for Video Question Answering", journal = j-TOMM, volume = "20", number = "4", pages = "91:1--91:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3630101", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3630101", abstract = "Video question answering (VideoQA) is challenging as it requires reasoning about natural language and multimodal interactive relations. Most existing methods apply attention mechanisms to extract interactions between the question and the video or to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ren:2024:CIT, author = "Bin Ren and Hao Tang and Fanyang Meng and Ding Runwei and Philip H. S. 
Torr and Nicu Sebe", title = "Cloth Interactive Transformer for Virtual Try-On", journal = j-TOMM, volume = "20", number = "4", pages = "92:1--92:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3617374", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3617374", abstract = "The 2D image-based virtual try-on has aroused increased interest from the multimedia and computer vision fields due to its enormous commercial value. Nevertheless, most existing image-based virtual try-on approaches directly combine the person-identity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nie:2024:CSI, author = "Xiushan Nie and Yang Shi and Ziyu Meng and Jin Huang and Weili Guan and Yilong Yin", title = "Complex Scenario Image Retrieval via Deep Similarity-aware Hashing", journal = j-TOMM, volume = "20", number = "4", pages = "93:1--93:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624016", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624016", abstract = "When performing hashing-based image retrieval, it is difficult to learn discriminative hash codes especially for the multi-label, zero-shot and fine-grained settings. This is due to the fact that the similarities vary, even within the same category, under \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tan:2024:CLS, author = "Jiawei Tan and Hongxing Wang and Junsong Yuan", title = "Characters Link Shots: Character Attention Network for Movie Scene Segmentation", journal = j-TOMM, volume = "20", number = "4", pages = "94:1--94:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3630257", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3630257", abstract = "Movie scene segmentation aims to automatically segment a movie into multiple story units, i.e., scenes, each of which is a series of semantically coherent and time-continual shots. Previous methods have continued efforts on shot semantic association, but \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:RRT, author = "Mingliang Zhou and Xinwen Zhao and Futing Luo and Jun Luo and Huayan Pu and Tao Xiang", title = "Robust {RGB-T} Tracking via Adaptive Modality Weight Correlation Filters and Cross-modality Learning", journal = j-TOMM, volume = "20", number = "4", pages = "95:1--95:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3630100", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3630100", abstract = "RGBT tracking is gaining popularity due to its ability to provide effective tracking results in a variety of weather conditions. 
However, feature specificity and complementarity have not been fully used in existing models that directly fuse the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:SOQ, author = "Zicheng Zhang and Wei Sun and Yingjie Zhou and Jun Jia and Zhichao Zhang and Jing Liu and Xiongkuo Min and Guangtao Zhai", title = "Subjective and Objective Quality Assessment for in-the-Wild Computer Graphics Images", journal = j-TOMM, volume = "20", number = "4", pages = "96:1--96:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631357", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3631357", abstract = "Computer graphics images (CGIs) are artificially generated by means of computer programs and are widely perceived under various scenarios, such as games, streaming media, etc. In practice, the quality of CGIs consistently suffers from poor rendering \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Roy:2024:CLV, author = "Shuvendu Roy and Ali Etemad", title = "Contrastive Learning of View-invariant Representations for Facial Expressions Recognition", journal = j-TOMM, volume = "20", number = "4", pages = "97:1--97:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632960", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3632960", abstract = "Although there has been much progress in the area of facial expression recognition (FER), most existing methods suffer when presented with images that have been captured from viewing angles that are non-frontal and substantially different from those used \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:GRA, author = "Jun Liu and Jiantao Zhou and Haiwei Wu and Weiwei Sun and Jinyu Tian", title = "Generating Robust Adversarial Examples against Online Social Networks {(OSNs)}", journal = j-TOMM, volume = "20", number = "4", pages = "98:1--98:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632528", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3632528", abstract = "Online Social Networks (OSNs) have blossomed into prevailing transmission channels for images in the modern era. 
Adversarial examples (AEs) deliberately designed to mislead deep neural networks (DNNs) are found to be fragile against the inevitable lossy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yao:2024:CMS, author = "Tao Yao and Yiru Li and Ying Li and Yingying Zhu and Gang Wang and Jun Yue", title = "Cross-modal Semantically Augmented Network for Image-text Matching", journal = j-TOMM, volume = "20", number = "4", pages = "99:1--99:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631356", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3631356", abstract = "Image-text matching plays an important role in solving the problem of cross-modal information processing. Since there are nonnegligible semantic differences between heterogeneous pairwise data, a crucial challenge is how to learn a unified representation. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Telili:2024:DBL, author = "Ahmed Telili and Sid Ahmed Fezza and Wassim Hamidouche and Hanene F. Z. 
Brachemi Meftah", title = "{2BiVQA}: Double Bi-{LSTM}-based Video Quality Assessment of {UGC} Videos", journal = j-TOMM, volume = "20", number = "4", pages = "100:1--100:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632178", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3632178", abstract = "Recently, with the growing popularity of mobile devices as well as video sharing platforms (e.g., YouTube, Facebook, TikTok, and Twitch), User-Generated Content (UGC) videos have become increasingly common and now account for a large portion of multimedia \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:WMS, author = "Hongzhou Chen and Haihan Duan and Maha Abdallah and Yufeng Zhu and Yonggang Wen and Abdulmotaleb {El Saddik} and Wei Cai", title = "{Web3 Metaverse}: State-of-the-Art and Vision", journal = j-TOMM, volume = "20", number = "4", pages = "101:1--101:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3630258", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3630258", abstract = "The metaverse, as a rapidly evolving socio-technical phenomenon, exhibits significant potential across diverse domains by leveraging Web3 (a.k.a. Web 3.0) technologies such as blockchain, smart contracts, and non-fungible tokens (NFTs). This survey aims \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:GBC, author = "Lilong Wang and Yunhui Shi and Jin Wang and Shujun Chen and Baocai Yin and Nam Ling", title = "Graph Based Cross-Channel Transform for Color Image Compression", journal = j-TOMM, volume = "20", number = "4", pages = "102:1--102:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631710", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3631710", abstract = "Adaptive transform coding is gaining more and more attention for better mining of image content over fixed transforms such as discrete cosine transform (DCT). As a special case, graph transform learning establishes a novel paradigm for the graph-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "102", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2024:SHO, author = "Kai Han and Yu Liu and Rukai Wei and Ke Zhou and Jinhui Xu and Kun Long", title = "Supervised Hierarchical Online Hashing for Cross-modal Retrieval", journal = j-TOMM, volume = "20", number = "4", pages = "103:1--103:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632527", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3632527", abstract = "Online cross-modal hashing has gained attention for its adaptability in processing streaming data. 
However, existing methods only define the hard similarity between data using labels. This results in poor retrieval performance, as they fail to exploit the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2024:SOT, author = "Fengyi Fu and Shancheng Fang and Weidong Chen and Zhendong Mao", title = "Sentiment-Oriented Transformer-Based Variational Autoencoder Network for Live Video Commenting", journal = j-TOMM, volume = "20", number = "4", pages = "104:1--104:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633334", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633334", abstract = "Automatic live video commenting is getting increasing attention due to its significance in narration generation, topic explanation, etc. However, the diverse sentiment consideration of the generated comments is missing from current methods. Sentimental \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2024:JCJ, author = "Yuxiang Peng and Chong Fu and Guixing Cao and Wei Song and Junxin Chen and Chiu-Wing Sham", title = "{JPEG}-compatible Joint Image Compression and Encryption Algorithm with File Size Preservation", journal = j-TOMM, volume = "20", number = "4", pages = "105:1--105:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633459", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633459", abstract = "Joint image compression and encryption algorithms are intensively investigated due to their powerful capability of simultaneous image data compression and sensitive information protection. Unfortunately, most of the existing algorithms suffered from \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "105", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:TEC, author = "Daizong Liu and Xiaoye Qu and Jianfeng Dong and Pan Zhou and Zichuan Xu and Haozhao Wang and Xing Di and Weining Lu and Yu Cheng", title = "Transform-Equivariant Consistency Learning for Temporal Sentence Grounding", journal = j-TOMM, volume = "20", number = "4", pages = "106:1--106:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634749", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634749", abstract = "This paper addresses the temporal sentence grounding (TSG). Although existing methods have made decent achievements in this task, they not only severely rely on abundant video-query paired data for training, but also easily fail into the dataset \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2024:STR, author = "Yijie Hu and Bin Dong and Kaizhu Huang and Lei Ding and Wei Wang and Xiaowei Huang and Qiu-Feng Wang", title = "Scene Text Recognition via Dual-path Network with Shape-driven Attention Alignment", journal = j-TOMM, volume = "20", number = "4", pages = "107:1--107:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633517", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633517", abstract = "Scene text recognition (STR), one typical sequence-to-sequence problem, has drawn much attention recently in multimedia applications. To guarantee good performance, it is essential for STR to obtain aligned character-wise features from the whole-image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:NHN, author = "Rongjiao Liang and Shichao Zhang and Wenzhen Zhang and Guixian Zhang and Jinyun Tang", title = "Nonlocal Hybrid Network for Long-tailed Image Classification", journal = j-TOMM, volume = "20", number = "4", pages = "108:1--108:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3630256", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3630256", abstract = "It is a significant issue to deal with long-tailed data when classifying images. 
A nonlocal hybrid network (NHN) that takes into account both feature learning and classifier learning is proposed. The NHN can capture the existence of dependencies between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2024:DMC, author = "Piao Shi and Min Hu and Xuefeng Shi and Fuji Ren", title = "Deep Modular Co-Attention Shifting Network for Multimodal Sentiment Analysis", journal = j-TOMM, volume = "20", number = "4", pages = "109:1--109:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634706", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634706", abstract = "Human Multimodal Sentiment Analysis (MSA) is an attractive research that studies sentiment expressed from multiple heterogeneous modalities. While transformer-based methods have achieved great success, designing an effective ``co-attention'' model to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:VLS, author = "Jing Zhang and Dan Guo and Xun Yang and Peipei Song and Meng Wang", title = "Visual-linguistic-stylistic Triple Reward for Cross-lingual Image Captioning", journal = j-TOMM, volume = "20", number = "4", pages = "110:1--110:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634917", abstract = "Generating image captions in different languages is worth exploring and essential for non-native speakers. Nevertheless, collecting paired annotation for every language is time-consuming and impractical, particularly for minor languages. To this end, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jia:2024:ENC, author = "Zhaoyang Jia and Yan Lu and Houqiang Li", title = "Exploring Neighbor Correspondence Matching for Multiple-hypotheses Video Frame Synthesis", journal = j-TOMM, volume = "20", number = "4", pages = "111:1--111:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633780", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633780", abstract = "Video frame synthesis, which consists of interpolation and extrapolation, is an essential video processing technique that can be applied to various scenarios. 
However, most existing methods cannot handle small objects or large motion well, especially in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:GPI, author = "Sheng Zhou and Dan Guo and Xun Yang and Jianfeng Dong and Meng Wang", title = "Graph Pooling Inference Network for Text-based {VQA}", journal = j-TOMM, volume = "20", number = "4", pages = "112:1--112:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634918", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634918", abstract = "Effectively leveraging objects and optical character recognition (OCR) tokens to reason out pivotal scene text is critical for the challenging Text-based Visual Question Answering (TextVQA) task. Graph-based models can effectively capture the semantic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2024:OBS, author = "Hengtong Hu and Lingxi Xie and Xinyue Huo and Richang Hong and Qi Tian", title = "One-Bit Supervision for Image Classification: Problem, Solution, and Beyond", journal = j-TOMM, volume = "20", number = "4", pages = "113:1--113:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633779", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633779", abstract = "This article presents one-bit supervision, a novel setting of learning with fewer labels, for image classification. Instead of the training model using the accurate label of each sample, our setting requires the model to interact with the system by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2024:DCB, author = "Hang Yuan and Wei Gao and Siwei Ma and Yiqiang Yan", title = "Divide-and-conquer-based {RDO}-free {CU} Partitioning for {8K} Video Compression", journal = j-TOMM, volume = "20", number = "4", pages = "114:1--114:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634705", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634705", abstract = "8K (7680$ \times $4320) ultra-high definition (UHD) videos are growing popular with the improvement of human visual experience demand. 
Therefore, the compression of 8K UHD videos has become a top priority in the third-generation audio video coding standard (AVS3). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:DWA, author = "Mingyu Li and Tao Zhou and Zhuo Huang and Jian Yang and Jie Yang and Chen Gong", title = "Dynamic Weighted Adversarial Learning for Semi-Supervised Classification under Intersectional Class Mismatch", journal = j-TOMM, volume = "20", number = "4", pages = "115:1--115:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635310", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635310", abstract = "Nowadays, class-mismatch problem has drawn intensive attention in Semi-Supervised Learning (SSL), where the classes of labeled data are assumed to be only a subset of the classes of unlabeled data. However, in a more realistic scenario, the labeled data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2024:SLC, author = "Hui Huang and Di Xiao and Jia Liang", title = "Secure Low-complexity Compressive Sensing with Preconditioning Prior Regularization Reconstruction", journal = j-TOMM, volume = "20", number = "4", pages = "116:1--116:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635308", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635308", abstract = "Compressive sensing (CS), a breakthrough technology in image processing, provides a privacy-preserving layer and image reconstruction while performing sensing and recovery processes, respectively. Unfortunately, it still faces high-complexity, low-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "116", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Clement:2024:SDH, author = "Nathan Clement and Alan Schoen and Arnold Boedihardjo and Andrew Jenkins", title = "Synthetic Data and Hierarchical Object Detection in Overhead Imagery", journal = j-TOMM, volume = "20", number = "4", pages = "117:1--117:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635309", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635309", abstract = "The performance of neural network models is often limited by the availability of big datasets. 
To treat this problem, we survey and develop novel synthetic data generation and augmentation techniques for enhancing low/zero-sample learning in satellite \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bian:2024:PAL, author = "Jiang Bian and Xuhong Li and Tao Wang and Qingzhong Wang and Jun Huang and Chen Liu and Jun Zhao and Feixiang Lu and Dejing Dou and Haoyi Xiong", title = "{P$^2$ANet}: a Large-Scale Benchmark for Dense Action Detection from Table Tennis Match Broadcasting Videos", journal = j-TOMM, volume = "20", number = "4", pages = "118:1--118:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633516", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633516", abstract = "While deep learning has been widely used for video analytics, such as video classification and action detection, dense action detection with fast-moving subjects from sports videos is still challenging. In this work, we release yet another sports video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:AIG, author = "Jifan Yang and Zhongyuan Wang and Guangcheng Wang and Baojin Huang and Yuhong Yang and Weiping Tu", title = "Auxiliary Information Guided Self-attention for Image Quality Assessment", journal = j-TOMM, volume = "20", number = "4", pages = "119:1--119:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635716", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635716", abstract = "Image quality assessment (IQA) is an important problem in computer vision with many applications. We propose a transformer-based multi-task learning framework for the IQA task. Two subtasks: constructing an auxiliary information error map and completing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2024:EVT, author = "Zhanzhou Feng and Jiaming Xu and Lei Ma and Shiliang Zhang", title = "Efficient Video Transformers via Spatial-temporal Token Merging for Action Recognition", journal = j-TOMM, volume = "20", number = "4", pages = "120:1--120:??", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633781", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jan 13 15:13:22 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633781", abstract = "Transformer has exhibited promising performance in various video recognition tasks but brings a huge computational cost in modeling spatial-temporal cues. This work aims to boost the efficiency of existing video transformers for action recognition through \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:PCA, author = "Shupei Zhang and Chenqiu Zhao and Anup Basu", title = "Principal Component Approximation Network for Image Compression", journal = j-TOMM, volume = "20", number = "5", pages = "121:1--121:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637490", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3637490", abstract = "In this work, we propose a novel principal component approximation network (PCANet) for image compression. 
The proposed network is based on the assumption that a set of images can be decomposed into several shared feature matrices, and an image can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:TEC, author = "Tianyu Zhang and Weiqing Min and Tao Liu and Shuqiang Jiang and Yong Rui", title = "Toward Egocentric Compositional Action Anticipation with Adaptive Semantic Debiasing", journal = j-TOMM, volume = "20", number = "5", pages = "122:1--122:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633333", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633333", abstract = "Predicting the unknown from the first-person perspective is expected as a necessary step toward machine intelligence, which is essential for practical applications including autonomous driving and robotics. As a human-level task, egocentric action \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:AVT, author = "Yu Liu and Mingbo Zhao and Zhao Zhang and Yuping Liu and Shuicheng Yan", title = "Arbitrary Virtual Try-on Network: Characteristics Preservation and Tradeoff between Body and Clothing", journal = j-TOMM, volume = "20", number = "5", pages = "123:1--123:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3636426", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3636426", abstract = "Deep learning based virtual try-on system has achieved some encouraging progress recently, but there still remain several big challenges that need to be solved, such as trying on arbitrary clothes of all types, trying on the clothes from one category to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:CCM, author = "Shih-Wei Yang and Li-Hsiang Shen and Hong-Han Shuai and Kai-Ten Feng", title = "{CMAF}: Cross-Modal Augmentation via Fusion for Underwater Acoustic Image Recognition", journal = j-TOMM, volume = "20", number = "5", pages = "124:1--124:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3636427", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3636427", abstract = "Underwater image recognition is crucial for underwater detection applications. 
Fish classification has been one of the emerging research areas in recent years. Existing image classification models usually classify data collected from terrestrial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:SAR, author = "Yazhou Zhang and Yang Yu and Mengyao Wang and Min Huang and M. Shamim Hossain", title = "Self-Adaptive Representation Learning Model for Multi-Modal Sentiment and Sarcasm Joint Analysis", journal = j-TOMM, volume = "20", number = "5", pages = "125:1--125:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635311", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635311", abstract = "Sentiment and sarcasm are intimate and complex, as sarcasm often deliberately elicits an emotional response in order to achieve its specific purpose. Current challenges in multi-modal sentiment and sarcasm joint detection mainly include multi-modal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qi:2024:DSD, author = "Lei Qi and Peng Dong and Tan Xiong and Hui Xue and Xin Geng", title = "{DoubleAUG}: Single-domain Generalized Object Detector in Urban via Color Perturbation and Dual-style Memory", journal = j-TOMM, volume = "20", number = "5", pages = "126:1--126:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634683", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3634683", abstract = "Object detection in urban scenarios is crucial for autonomous driving in intelligent traffic systems. However, unlike conventional object detection tasks, urban-scene images vary greatly in style. For example, images taken on sunny days differ \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2024:ICM, author = "Dan Shi and Lei Zhu and Jingjing Li and Guohua Dong and Huaxiang Zhang", title = "Incomplete Cross-Modal Retrieval with Deep Correlation Transfer", journal = j-TOMM, volume = "20", number = "5", pages = "127:1--127:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637442", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3637442", abstract = "Most cross-modal retrieval methods assume the multi-modal training data is complete and has a one-to-one correspondence. 
However, in the real world, multi-modal data generally suffers from missing modality information due to the uncertainty of data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2024:MPS, author = "Xianhua Zeng and Xinyu Wang and Yicai Xie", title = "Multiple Pseudo-{Siamese} Network with Supervised Contrast Learning for Medical Multi-modal Retrieval", journal = j-TOMM, volume = "20", number = "5", pages = "128:1--128:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637441", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3637441", abstract = "Medical multi-modal retrieval aims to provide doctors with similar medical images from different modalities, which can greatly promote the efficiency and accuracy of clinical diagnosis. However, most existing medical retrieval methods hardly support the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{You:2024:MOT, author = "Sisi You and Hantao Yao and Bing-Kun Bao and Changsheng Xu", title = "Multi-object Tracking with Spatial-Temporal Tracklet Association", journal = j-TOMM, volume = "20", number = "5", pages = "129:1--129:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635155", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635155", abstract = "Recently, the tracking-by-detection methods have achieved excellent performance in Multi-Object Tracking (MOT), which focuses on obtaining a robust feature for each object and generating tracklets based on feature similarity. However, they are confronted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bingol:2024:QEW, author = "G{\"u}lnaziye Bing{\"o}l and Simone Porcu and Alessandro Floris and Luigi Atzori", title = "{QoE} Estimation of {WebRTC}-based Audio-visual Conversations from Facial and Speech Features", journal = j-TOMM, volume = "20", number = "5", pages = "130:1--130:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638251", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638251", abstract = "The utilization of user's facial- and speech-related features for the estimation of the Quality of Experience (QoE) of multimedia services is still underinvestigated despite its potential. Currently, only the use of either facial or speech features \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qiu:2024:LOP, author = "Heqian Qiu and Hongliang Li and Qingbo Wu and Hengcan Shi and Lanxiao Wang and Fanman Meng and Linfeng Xu", title = "Learning Offset Probability Distribution for Accurate Object Detection", journal = j-TOMM, volume = "20", number = "5", pages = "131:1--131:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637214", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3637214", abstract = "Object detection combines object classification and object localization problems. 
Current object detection methods heavily depend on regression networks to locate objects, which are optimized with various regression loss functions to predict offsets \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Floris:2024:CMP, author = "Alessandro Floris and Simone Porcu and Luigi Atzori", title = "Controlling Media Player with Hands: a Transformer Approach and a Quality of Experience Assessment", journal = j-TOMM, volume = "20", number = "5", pages = "132:1--132:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638560", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638560", abstract = "In this article, we propose a Hand Gesture Recognition (HGR) system based on a novel deep transformer (DT) neural network for media player control. The extracted hand skeleton features are processed by separate transformers for each finger in isolation to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:EVR, author = "Jingyu Li and Zhendong Mao and Hao Li and Weidong Chen and Yongdong Zhang", title = "Exploring Visual Relationships via Transformer-based Graphs for Enhanced Image Captioning", journal = j-TOMM, volume = "20", number = "5", pages = "133:1--133:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638558", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638558", abstract = "Image captioning (IC), bringing vision to language, has drawn extensive attention. A crucial aspect of IC is the accurate depiction of visual relations among image objects. Visual relations encompass two primary facets: content relations and structural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2024:HLD, author = "Zeyu Ma and Siwei Wang and Xiao Luo and Zhonghui Gu and Chong Chen and Jinxing Li and Xian-Sheng Hua and Guangming Lu", title = "{HARR}: Learning Discriminative and High-Quality Hash Codes for Image Retrieval", journal = j-TOMM, volume = "20", number = "5", pages = "134:1--134:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3627162", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3627162", abstract = "This article studies deep unsupervised hashing, which has attracted increasing attention in large-scale image retrieval. The majority of recent approaches usually reconstruct semantic similarity information, which then guides the hash code learning. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:CWS, author = "Chengyang Zhang and Yong Zhang and Bo Li and Xinglin Piao and Baocai Yin", title = "{CrowdGraph}: Weakly supervised Crowd Counting via Pure Graph Neural Network", journal = j-TOMM, volume = "20", number = "5", pages = "135:1--135:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638774", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638774", abstract = "Most existing weakly supervised crowd counting methods utilize Convolutional Neural Networks (CNN) or Transformer to estimate the total number of individuals in an image. However, both CNN-based (grid-to-count paradigm) and Transformer-based (sequence-to-. \ldots{})", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "135", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:WGO, author = "Jie Wang and Guoqiang Li and Jie Shi and Jinwen Xi", title = "Weighted Guided Optional Fusion Network for {RGB-T} Salient Object Detection", journal = j-TOMM, volume = "20", number = "5", pages = "136:1--136:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624984", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624984", abstract = "There is no doubt that the rational and effective use of visible and thermal infrared image data information to achieve cross-modal complementary fusion is the key to improving the performance of RGB-T salient object detection (SOD). A meticulous analysis \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "136", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:JAV, author = "Yibo Zhang and Weiguo Lin and Junfeng Xu", title = "Joint Audio-Visual Attention with Contrastive Learning for More General Deepfake Detection", journal = j-TOMM, volume = "20", number = "5", pages = "137:1--137:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625100", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625100", abstract = "With the continuous advancement of deepfake technology, there has been a surge in the creation of realistic fake videos. 
Unfortunately, the malicious utilization of deepfake poses a significant threat to societal morality and political security. Therefore,. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "137", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:KIM, author = "Depei Wang and Ruifeng Xu and Lianglun Cheng and Zhuowei Wang", title = "Knowledge-integrated Multi-modal Movie Turning Point Identification", journal = j-TOMM, volume = "20", number = "5", pages = "138:1--138:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638557", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638557", abstract = "The rapid development of artificial intelligence provides rich technologies and tools for the automated understanding of literary works. As a comprehensive carrier of storylines, movies are natural multimodal data sources that provide sufficient data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "138", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:DCF, author = "Chunpu Liu and Guanglei Yang and Wangmeng Zuo and Tianyi Zang", title = "{DPDFormer}: a Coarse-to-Fine Model for Monocular Depth Estimation", journal = j-TOMM, volume = "20", number = "5", pages = "139:1--139:??", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638559", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Apr 10 08:42:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3638559", abstract = "Monocular depth estimation attracts great attention from computer vision researchers for its convenience in acquiring environment depth information. Recently classification-based MDE methods show its promising performance and begin to act as an essential \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.",
  articleno =    "139",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2024:TSP,
  author =       "Yunyao Yan and Guoqing Xiang and Huizhu Jia and Jie Chen and Xiaofeng Huang and Xiaodong Xie",
  title =        "Two-Stage Perceptual Quality Oriented Rate Control Algorithm for {HEVC}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "140:1--140:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3636510",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3636510",
  abstract =     "As a practical technique in mainstream video coding applications, rate control dominates important to ensure compression quality with limited bitrates constraints. However, most rate control methods mainly focus on objective quality while ignoring the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "140",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2024:VDG,
  author =       "Zongyi Li and Yuxuan Shi and Hefei Ling and Jiazhong Chen and Boyuan Liu and Runsheng Wang and Chengxin Zhao",
  title =        "Viewpoint Disentangling and Generation for Unsupervised Object {Re-ID}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "141:1--141:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632959",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632959",
  abstract =     "Unsupervised object Re-ID aims to learn discriminative identity features from a fully unlabeled dataset to solve the open-class re-identification problem. Satisfying results have been achieved in existing unsupervised Re-ID methods, primarily trained with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "141",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Dai:2024:TLF,
  author =       "Kuai Dai and Xutao Li and Huiwei Lin and Yin Jiang and Xunlai Chen and Yunming Ye and Di Xian",
  title =        "{TinyPredNet}: a Lightweight Framework for Satellite Image Sequence Prediction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "142:1--142:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638773",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638773",
  abstract =     "Satellite image sequence prediction aims to precisely infer future satellite image frames with historical observations, which is a significant and challenging dense prediction task. Though existing deep learning models deliver promising performance for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "142",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ma:2024:RRA,
  author =       "Yingnan Ma and Chenqiu Zhao and Bingran Huang and Xudong Li and Anup Basu",
  title =        "{RAST}: Restorable Arbitrary Style Transfer",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "143:1--143:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638770",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638770",
  abstract =     "The objective of arbitrary style transfer is to apply a given artistic or photo-realistic style to a target image.
Although current methods have shown some success in transferring style, arbitrary style transfer still has several issues, including content \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "143",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hsu:2024:CDA,
  author =       "Wei-Yen Hsu and Hsien-Wen Lin",
  title =        "Context-detail-aware United Network for Single Image Deraining",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "144:1--144:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3639407",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639407",
  abstract =     "Images captured outdoors are often affected by rainy days, resulting in a severe deterioration in the visual quality of the captured images and a decrease in the performance of related applications. Therefore, single image deraining has attracted \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "144",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:TSM,
  author =       "Yao Liu and Gangfeng Cui and Jiahui Luo and Xiaojun Chang and Lina Yao",
  title =        "Two-stream Multi-level Dynamic Point Transformer for Two-person Interaction Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "145:1--145:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3639470",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639470",
  abstract =     "As a fundamental aspect of human life, two-person interactions contain meaningful information about people's activities, relationships, and social settings. Human action recognition serves as the foundation for many smart applications, with a strong focus \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "145",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:MCT,
  author =       "Chengxin Chen and Pengyuan Zhang",
  title =        "Modality-collaborative Transformer with Hybrid Feature Reconstruction for Robust Emotion Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "146:1--146:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3640343",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3640343",
  abstract =     "As a vital aspect of affective computing, Multimodal Emotion Recognition has been an active research area in the multimedia community. Despite recent progress, this field still confronts two major challenges in real-world applications: (1) improving the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "146",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2024:UOI,
  author =       "Jiafeng Huang and Tianjun Zhang and Shengjie Zhao and Lin Zhang and Yicong Zhou",
  title =        "An Underwater Organism Image Dataset and a Lightweight Module Designed for Object Detection Networks",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "147:1--147:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3640465",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3640465",
  abstract =     "Long-term monitoring and recognition of underwater organism objects are of great significance in marine ecology, fisheries science and many other disciplines. Traditional techniques in this field, including manual fishing-based ones and sonar-based ones, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun.
Appl.",
  articleno =    "147",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:PPM,
  author =       "Jing Liu and Litao Shang and Yuting Su and Weizhi Nie and Xin Wen and Anan Liu",
  title =        "Privacy-preserving Multi-source Cross-domain Recommendation Based on Knowledge Graph",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "148:1--148:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3639706",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639706",
  abstract =     "The cross-domain recommender systems aim to alleviate the data sparsity problem in the target domain by transferring knowledge from the auxiliary domain. However, existing works ignore the fact that the data sparsity problem may also exist in the single \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "148",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:BDB,
  author =       "Xingyu Liu and Zhongyun Hua and Shuang Yi and Yushu Zhang and Yicong Zhou",
  title =        "Bi-directional Block Encoding for Reversible Data Hiding over Encrypted Images",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "149:1--149:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638771",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638771",
  abstract =     "Reversible data hiding over encrypted images (RDH-EI) technology is a viable solution for privacy-preserving cloud storage, as it enables the reversible embedding of additional data into images while maintaining image confidentiality. Since the data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "149",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yi:2024:OVS,
  author =       "Peng Yi and Zhongyuan Wang and Laigan Luo and Kui Jiang and Zheng He and Junjun Jiang and Tao Lu and Jiayi Ma",
  title =        "Omniscient Video Super-Resolution with Explicit-Implicit Alignment",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "150:1--150:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3640346",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3640346",
  abstract =     "When considering the temporal relationships, most previous video super-resolution (VSR) methods follow the iterative or recurrent framework. The iterative framework adopts neighboring low-resolution (LR) frames from a sliding window, while the recurrent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "150",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2024:ISI,
  author =       "Amit Kumar Singh and Deepa Kundur and Mauro Conti",
  title =        "Introduction to the Special Issue on Integrity of Multimedia and Multimodal Data in {Internet of Things}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "151:1--151:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3643040",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3643040",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun.
Appl.",
  articleno =    "151",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2024:BBS,
  author =       "Wenyuan Yang and Shaocong Wu and Jianwei Fei and Xianwang Zeng and Yuemin Ding and Zhihua Xia",
  title =        "A Bitcoin-based Secure Outsourcing Scheme for Optimization Problem in Multimedia {Internet of Things}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "152:1--152:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3637489",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3637489",
  abstract =     "With the development of the Internet of Things (IoT) and cloud computing, various multimedia data such as audio, video, and images have experienced explosive growth, ushering in the era of big data. Large-scale computing tasks in the Multimedia Internet \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "152",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:PIP,
  author =       "Qingzhi Liu and Yuchen Huang and Chenglu Jin and Xiaohan Zhou and Ying Mao and Cagatay Catal and Long Cheng",
  title =        "Privacy and Integrity Protection for {IoT} Multimodal Data Using Machine Learning and Blockchain",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "153:1--153:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638769",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638769",
  abstract =     "With the wide application of Internet of Things (IoT) technology, large volumes of multimodal data are collected and analyzed for various diagnoses, analyses, and predictions to help in decision-making and management. However, the research on protecting \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "153",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Jonker:2024:DPE,
  author =       "Simon Jonker and Malthe Jelstrup and Weizhi Meng and Brooke Lampe",
  title =        "Detecting Post Editing of Multimedia Images using Transfer Learning and Fine Tuning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "154:1--154:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3633284",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3633284",
  abstract =     "In the domain of general image forgery detection, a myriad of different classification solutions have been developed to distinguish a ``tampered'' image from a ``pristine'' image. In this work, we aim to develop a new method to tackle the problem of binary \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "154",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Bisogni:2024:IEB,
  author =       "Carmen Bisogni and Lucia Cascone and Michele Nappi and Chiara Pero",
  title =        "{IoT}-enabled Biometric Security: Enhancing Smart Car Safety with Depth-based Head Pose Estimation",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "155:1--155:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3639367",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639367",
  abstract =     "Advanced Driver Assistance Systems (ADAS) are experiencing higher levels of automation, facilitated by the synergy among various sensors integrated within vehicles, thereby forming an Internet of Things (IoT) framework. Among these sensors, cameras have \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "155",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Nouma:2024:TED,
  author =       "Saif E. Nouma and Attila A. Yavuz",
  title =        "Trustworthy and Efficient Digital Twins in Post-Quantum Era with Hybrid Hardware-Assisted Signatures",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "156:1--156:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638250",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638250",
  abstract =     "Digital Twins (DT) virtually model cyber-physical objects via sensory inputs by simulating or monitoring their behavior.
Therefore, DTs usually harbor vast quantities of Internet of Things (IoT) components (e.g., sensors) that gather, process, and offload \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "156",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2024:VDS,
  author =       "Fan Li and Yanxiang Chen and Haiyang Liu and Zuxing Zhao and Yuanzhi Yao and Xin Liao",
  title =        "Vocoder Detection of Spoofing Speech Based on {GAN} Fingerprints and Domain Generalization",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "157:1--157:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630751",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630751",
  abstract =     "As an important part of the text-to-speech (TTS) system, vocoders convert acoustic features into speech waveforms. The difference in vocoders is key to producing different types of forged speech in the TTS system. With the rapid development of general \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "157",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gao:2024:IMC,
  author =       "Jing Gao and Peng Li and Asif Ali Laghari and Gautam Srivastava and Thippa Reddy Gadekallu and Sidra Abbas and Jianing Zhang",
  title =        "Incomplete Multiview Clustering via Semidiscrete Optimal Transport for Multimedia Data Mining in {IoT}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "158:1--158:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625548",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625548",
  abstract =     "With the wide deployment of the Internet of Things (IoT), large volumes of incomplete multiview data that violates data integrity is generated by various applications, which inevitably produces negative impacts on the quality of service of IoT systems. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "158",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:PAR,
  author =       "Zhenyu Liu and Da Li and Xinyu Zhang and Zhang Zhang and Peng Zhang and Caifeng Shan and Jungong Han",
  title =        "Pedestrian Attribute Recognition via Spatio-temporal Relationship Learning for Visual Surveillance",
  journal =      j-TOMM,
  volume =       "20",
  number =       "6",
  pages =        "159:1--159:??",
  month =        jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632624",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:44 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632624",
  abstract =     "Pedestrian attribute recognition (PAR) aims at predicting the visual attributes of a pedestrian image. PAR has been used as soft biometrics for visual surveillance and IoT security. Most of the current PAR methods are developed based on discrete images. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "159",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Garcia:2024:SNF,
  author =       "Roberto Garc{\'\i}a and Ana Cediel and Merc{\`e} Teixid{\'o} and Rosa Gil",
  title =        "Semantics and Non-fungible Tokens for Copyright Management on the Metaverse and Beyond",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "186:1--186:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3585387",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3585387",
  abstract =     "Recent initiatives related to the Metaverse focus on better visualization, like augmented or virtual reality, but also persistent digital objects. To guarantee real ownership of these digital objects, open systems based on public blockchains and Non-. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun.
Appl.",
  articleno =    "186",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xie:2024:RCA,
  author =       "Tianxiu Xie and Keke Gai and Liehuang Zhu and Shuo Wang and Zijian Zhang",
  title =        "{RAC-Chain}: an Asynchronous Consensus-based Cross-chain Approach to Scalable Blockchain for Metaverse",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "187:1--187:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3586011",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3586011",
  abstract =     "The metaverse, as an emerging technical term, conceptually aims to construct a virtual digital space that runs parallel to the physical world. Due to human behaviors and interactions being represented in the virtual world, security in the metaverse is a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "187",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ren:2024:HCC,
  author =       "Yongjun Ren and Zhiying Lv and Neal N. Xiong and Jin Wang",
  title =        "{HCNCT}: a Cross-chain Interaction Scheme for the Blockchain-based Metaverse",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "188:1--188:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3594542",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3594542",
  abstract =     "As a new type of digital living space that blends virtual and reality, Metaverse combines many emerging technologies. It provides an immersive experience based on VR technology and stores and protects users' digital content and digital assets through \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "188",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:QQC,
  author =       "Shuangmin Chen and Rui Xu and Jian Xu and Shiqing Xin and Changhe Tu and Chenglei Yang and Lin Lu",
  title =        "{QuickCSGModeling}: Quick {CSG} Operations Based on Fusing Signed Distance Fields for {VR} Modeling",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "189:1--189:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599729",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599729",
  abstract =     "The latest advancements in Virtual Reality (VR) enable the creation of 3D models within a holographic immersive simulation environment. In this article, we create QuickCSGModeling, a user-friendly mid-air interactive modeling system. We first prepare a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "189",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:PPA,
  author =       "Qinnan Zhang and Zehui Xiong and Jianming Zhu and Sheng Gao and Wanting Yang",
  title =        "A Privacy-preserving Auction Mechanism for Learning Model as an {NFT} in Blockchain-driven Metaverse",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "190:1--190:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599971",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599971",
  abstract =     "The Metaverse, envisioned as the next-generation Internet, will be constructed via twining a practical world in a virtual form, wherein Meterverse service providers (MSPs) are required to collect massive data from Meterverse users (MUs). In this regard, a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun.
Appl.",
  articleno =    "190",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2024:MMI,
  author =       "Han Wang and Hui Li and Abla Smahi and Feng Zhao and Yao Yao and Ching Chuen Chan and Shiyu Wang and Wenyuan Yang and Shuo-Yen Robert Li",
  title =        "{MIS}: a Multi-Identifier Management and Resolution System in the Metaverse",
  journal =      j-TOMM,
  volume =       "20",
  number =       "7",
  pages =        "191:1--191:??",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597641",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat May 18 06:38:45 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597641",
  abstract =     "The metaverse gradually evolves into a virtual world containing a series of interconnected sub-metaverses. Diverse digital resources, including identities, contents, services, and supporting data, are key components of the sub-metaverse. Therefore, a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "191",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:HFM,
  author =       "Jinliang Liu and Zhedong Zheng and Zongxin Yang and Yi Yang",
  title =        "High Fidelity Makeup via {$2$D} and {$3$D} Identity Preservation Net",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "230:1--230:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3656475",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3656475",
  abstract =     "In this article, we address the challenging makeup transfer task, aiming to transfer makeup from a reference image to a source image while preserving facial geometry and background consistency. Existing deep neural network-based methods have shown \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "230",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2024:RTA,
  author =       "Junjian Huang and Hao Ren and Shulin Liu and Yong Liu and Chuanlu Lv and Jiawen Lu and Changyong Xie and Hong Lu",
  title =        "Real-Time Attentive Dilated {$U$}-Net for Extremely Dark Image Enhancement",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "231:1--231:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3654668",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3654668",
  abstract =     "Images taken under low-light conditions suffer from poor visibility, color distortion, and graininess, all of which degrade the image quality and hamper the performance of downstream vision tasks, such as object detection and instance segmentation in the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "231",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xiong:2024:ICI,
  author =       "Mingfu Xiong and Kaikang Hu and Zhihan Lyu and Fei Fang and Zhongyuan Wang and Ruimin Hu and Khan Muhammad",
  title =        "Inter-camera Identity Discrimination for Unsupervised Person Re-identification",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "232:1--232:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3652858",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3652858",
  abstract =     "Unsupervised person re-identification (Re-ID) has garnered significant attention because of its data-friendly nature, as it does not require labeled data. Existing approaches primarily address this challenge by employing feature-clustering techniques to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "232",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yu:2024:PGE,
  author =       "Jiaqi Yu and Jinhai Yang and Hua Yang and Renjie Pan and Pingrui Lai and Guangtao Zhai",
  title =        "Psychology-Guided Environment Aware Network for Discovering Social Interaction Groups from Videos",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "233:1--233:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3657295",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3657295",
  abstract =     "Social interaction is a common phenomenon in human societies.
Different from discovering groups based on the similarity of individuals' actions, social interaction focuses more on the mutual influence between people. Although people can easily judge \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "233",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:SSS,
  author =       "Qi Liu and Xinchen Liu and Kun Liu and Xiaoyan Gu and Wu Liu",
  title =        "{SigFormer}: Sparse Signal-guided Transformer for Multi-modal Action Segmentation",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "234:1--234:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3657296",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3657296",
  abstract =     "Multi-modal human action segmentation is a critical and challenging task with a wide range of applications. Nowadays, the majority of approaches concentrate on the fusion of dense signals (i.e., RGB, optical flow, and depth maps). However, the potential \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "234",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lyu:2024:DDB,
  author =       "Jun Lyu and Shouang Yan and M. Shamim Hossain",
  title =        "{DBGAN}: Dual Branch Generative Adversarial Network for Multi-Modal {MRI} Translation",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "235:1--235:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3657298",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3657298",
  abstract =     "Existing magnetic resonance imaging translation models rely on generative adversarial networks, primarily employing simple convolutional neural networks. Unfortunately, these networks struggle to capture global representations and contextual relationships \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "235",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:BDG,
  author =       "Dejun Zhang and Mian Zhang and Xuefeng Tan and Jun Liu",
  title =        "Bridging the Domain Gap in Scene Flow Estimation via Hierarchical Smoothness Refinement",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "236:1--236:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3661823",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3661823",
  abstract =     "This article introduces SmoothFlowNet3D, an innovative encoder-decoder architecture specifically designed for bridging the domain gap in scene flow estimation. To achieve this goal, SmoothFlowNet3D divides the scene flow estimation task into two stages: \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "236",
  fjournal =     "ACM Transactions on Multimedia Computing, Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:ISC,
  author =       "Ning Chen and Zhipeng Cheng and Xuwei Fan and Zhang Liu and Bangzhen Huang and Yifeng Zhao and Lianfen Huang and Xiaojiang Du and Mohsen Guizani",
  title =        "Integrated Sensing, Communication, and Computing for Cost-effective Multimodal Federated Perception",
  journal =      j-TOMM,
  volume =       "20",
  number =       "8",
  pages =        "237:1--237:??",
  month =        aug,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3661313",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Aug 28 06:37:02 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3661313",
  abstract =     "Federated learning (FL) is a prominent paradigm of 6G edge intelligence (EI), which mitigates privacy breaches and high communication pressure caused by conventional centralized model training in the artificial intelligence of things (AIoT). The execution \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "237", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:LVC, author = "Jiayu Yang and Chunhui Yang and Fei Xiong and Yongqi Zhai and Ronggang Wang", title = "Learned Video Compression with Adaptive Temporal Prior and Decoded Motion-aided Quality Enhancement", journal = j-TOMM, volume = "20", number = "8", pages = "238:1--238:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3661824", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3661824", abstract = "Learned video compression has drawn great attention and shown promising compression performance recently. In this article, we focus on the two components in the learned video compression framework, the conditional entropy model and quality enhancement \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "238", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gu:2024:RAF, author = "Xiaoling Gu and Junkai Zhu and Yongkang Wong and Zizhao Wu and Jun Yu and Jianping Fan and Mohan Kankanhalli", title = "Recurrent Appearance Flow for Occlusion-Free Virtual Try-On", journal = j-TOMM, volume = "20", number = "8", pages = "239:1--239:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659581", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3659581", abstract = "Image-based virtual try-on aims at transferring a target in-shop garment onto a reference person, and has garnered significant attention from the research communities recently. However, previous methods have faced severe challenges in handling occlusion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "239", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lyu:2024:ISI, author = "Yuanjie Lyu and Penggang Qin and Tong Xu and Chen Zhu and Enhong Chen", title = "{InteractNet}: Social Interaction Recognition for Semantic-rich Videos", journal = j-TOMM, volume = "20", number = "8", pages = "240:1--240:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663668", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663668", abstract = "The overwhelming surge of online video platforms has raised an urgent need for social interaction recognition techniques. 
Compared with simple short-term actions, long-term social interactions in semantic-rich videos could reflect more complicated \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "240", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bhattacharjee:2024:ESM, author = "Mrinmoy Bhattacharjee and S. R. Mahadeva Prasanna and Prithwijit Guha", title = "Exploration of Speech and Music Information for Movie Genre Classification", journal = j-TOMM, volume = "20", number = "8", pages = "241:1--241:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664197", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664197", abstract = "Movie genre prediction from trailers is mostly attempted in a multi-modal manner. However, the characteristics of movie trailer audio indicate that this modality alone might be highly effective in genre prediction. Movie trailer audio predominantly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "241", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sarto:2024:TRA, author = "Sara Sarto and Marcella Cornia and Lorenzo Baraldi and Alessandro Nicolosi and Rita Cucchiara", title = "Towards Retrieval-Augmented Architectures for Image Captioning", journal = j-TOMM, volume = "20", number = "8", pages = "242:1--242:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663667", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663667", abstract = "The objective of image captioning models is to bridge the gap between the visual and linguistic modalities by generating natural language descriptions that accurately reflect the content of input images. In recent years, researchers have leveraged deep \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "242", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:PAP, author = "Kaihui Yang and Junwei Han and Guangyu Guo and Chaowei Fang and Yingzi Fan and Lechao Cheng and Dingwen Zhang", title = "Progressive Adapting and Pruning: Domain-Incremental Learning for Saliency Prediction", journal = j-TOMM, volume = "20", number = "8", pages = "243:1--243:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3661312", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3661312", abstract = "Saliency prediction (SAP) plays a crucial role in simulating the visual perception function of human beings. In practical situations, humans can quickly grasp saliency extraction in new image domains. However, current SAP methods mainly concentrate on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "243", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2024:HED, author = "Lv Tang and Xinfeng Zhang", title = "High Efficiency Deep-learning Based Video Compression", journal = j-TOMM, volume = "20", number = "8", pages = "244:1--244:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3661311", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3661311", abstract = "Although deep learning technique has achieved significant improvement on image compression, but its advantages are not fully explored in video compression, which leads to the performance of deep-learning-based video compression (DLVC) is obviously \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "244", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gomes:2024:AAG, author = "Pedro de Medeiros Gomes and Silvia Rossi and Laura Toni", title = "{AGAR} --- Attention Graph-{RNN} for Adaptative Motion Prediction of Point Clouds of Deformable Objects", journal = j-TOMM, volume = "20", number = "8", pages = "245:1--245:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3662183", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3662183", abstract = "This article focuses on motion prediction for point cloud sequences in the challenging case of deformable 3D objects, such as human body motion. 
First, we investigate the challenges caused by deformable shapes and complex motions present in this type of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "245", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ye:2024:UUR, author = "Jiabo Ye and Junfeng Tian and Ming Yan and Haiyang Xu and Qinghao Ye and Yaya Shi and Xiaoshan Yang and Xuwu Wang and Ji Zhang and Liang He and Xin Lin", title = "{UniQRNet}: Unifying Referring Expression Grounding and Segmentation with {QRNet}", journal = j-TOMM, volume = "20", number = "8", pages = "246:1--246:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3660638", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3660638", abstract = "Referring expression comprehension aims to align natural language queries with visual scenes, which requires establishing fine-grained correspondence between vision and language. This has important applications in multi-modal reasoning systems. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "246", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:BQA, author = "Wei Zhou and Qi Yang and Wu Chen and Qiuping Jiang and Guangtao Zhai and Weisi Lin", title = "Blind Quality Assessment of Dense {$3$D} Point Clouds with Structure Guided Resampling", journal = j-TOMM, volume = "20", number = "8", pages = "247:1--247:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664199", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664199", abstract = "Objective quality assessment of three-dimensional (3D) point clouds is essential for the development of immersive multimedia systems in real-world applications. Despite the success of perceptual quality evaluation for 2D images and videos, blind/no-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "247", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2024:EWZ, author = "Yuli Zhao and Yin Zhang and Francis C. M. 
Lau and Hai Yu and Zhiliang Zhu and Bin Zhang", title = "Expanding-Window Zigzag Decodable Fountain Codes for Scalable Multimedia Transmission", journal = j-TOMM, volume = "20", number = "8", pages = "248:1--248:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664610", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664610", abstract = "In this article, we present a coding method called expanding-window zigzag decodable fountain code with unequal error protection property (EWF-ZD UEP code) to achieve scalable multimedia transmission. The key idea of the EWF-ZD UEP code is to utilize bit-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "248", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jin:2024:USR, author = "Xuanyu Jin and Ni Li and Wanzeng Kong and Jiajia Tang and Bing Yang", title = "Unbiased Semantic Representation Learning Based on Causal Disentanglement for Domain Generalization", journal = j-TOMM, volume = "20", number = "8", pages = "249:1--249:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659953", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3659953", abstract = "Domain generalization primarily mitigates domain shift among multiple source domains, generalizing the trained model to an unseen target domain. However, the spurious correlation usually caused by context prior (e.g., background) makes it challenging to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "249", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2024:SSM, author = "Bo Peng and Lin Sun and Jianjun Lei and Bingzheng Liu and Haifeng Shen and Wanqing Li and Qingming Huang", title = "Self-Supervised Monocular Depth Estimation via Binocular Geometric Correlation Learning", journal = j-TOMM, volume = "20", number = "8", pages = "250:1--250:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663570", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663570", abstract = "Monocular depth estimation aims to infer a depth map from a single image. Although supervised learning-based methods have achieved remarkable performance, they generally rely on a large amount of labor-intensively annotated data. Self-supervised methods, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "250", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:DPJ, author = "Yang Yang and Shuailong Qiu and Lanling Zeng and Zhigeng Pan", title = "Detail-preserving Joint Image Upsampling", journal = j-TOMM, volume = "20", number = "8", pages = "251:1--251:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665246", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665246", abstract = "Image operators can be instrumental to computational imaging and photography. However, many of them are computationally intensive. 
In this article, we propose an effective yet efficient joint upsampling method to accelerate various image operators. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "251", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kang:2024:OCM, author = "Xiao Kang and Xingbo Liu and Wen Xue and Xiushan Nie and Yilong Yin", title = "Online Cross-modal Hashing With Dynamic Prototype", journal = j-TOMM, volume = "20", number = "8", pages = "252:1--252:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665249", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665249", abstract = "Online cross-modal hashing has received increasing attention due to its efficiency and effectiveness in handling cross-modal streaming data retrieval. Despite the promising performance, these methods mainly focus on the supervised learning paradigm, demanding expensive and laborious work to obtain clean annotated data. Existing unsupervised online hashing methods mostly struggle to construct instructive semantic correlations among data chunks, resulting in the forgetting of accumulated data distribution. To this end, we propose a Dynamic Prototype-based Online Cross-modal Hashing method, called DPOCH. Based on the pre-learned reliable common representations, DPOCH generates prototypes incrementally as sketches of accumulated data and updates them dynamically for adapting streaming data. Thereafter, the prototype-based semantic embedding and similarity graphs are designed to promote stability and generalization of the hashing process, thereby obtaining globally adaptive hash codes and hash functions. 
Experimental results on benchmarked datasets demonstrate that the proposed DPOCH outperforms state-of-the-art unsupervised online cross-modal hashing methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "252", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:SFS, author = "Yuqing Yang and Boris Joukovsky and Jos{\'e} Oramas Mogrovejo and Tinne Tuytelaars and Nikos Deligiannis", title = "{SNIPPET}: a Framework for Subjective Evaluation of Visual Explanations Applied to {DeepFake} Detection", journal = j-TOMM, volume = "20", number = "8", pages = "253:1--253:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665248", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665248", abstract = "Explainable Artificial Intelligence (XAI) attempts to help humans understand machine learning decisions better and has been identified as a critical component toward increasing the trustworthiness of complex black-box systems, such as deep neural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "253", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2024:IAL, author = "Jinwang Pan and Xianming Liu and Yuanchao Bai and Deming Zhai and Junjun Jiang and Debin Zhao", title = "Illumination-Aware Low-Light Image Enhancement with Transformer and Auto-Knee Curve", journal = j-TOMM, volume = "20", number = "8", pages = "254:1--254:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664653", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664653", abstract = "Images captured under low-light conditions suffer from several combined degradation factors, including low brightness, low contrast, noise, and color bias. Many learning-based techniques attempt to learn the low-to-clear mapping between low-light and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "254", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tiotsop:2024:MID, author = "Lohic Fotio Tiotsop and Antonio Servetti and Peter Pocta and Glenn {Van Wallendael} and Marcus Barkowsky and Enrico Masala", title = "Multiple Image Distortion {DNN} Modeling Individual Subject Quality Assessment", journal = j-TOMM, volume = "20", number = "8", pages = "255:1--255:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664198", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664198", abstract = "A recent research direction is focused on training Deep Neural Networks (DNNs) to replicate individual subject assessments of media quality. These DNNs are referred to as Artificial Intelligence-based Observers (AIOs). An AIO is designed to simulate, in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "255", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2024:HHK, author = "Yunhui Xu and Youru Li and Muhao Xu and Zhenfeng Zhu and Yao Zhao", title = "{HKA}: a Hierarchical Knowledge Alignment Framework for Multimodal Knowledge Graph Completion", journal = j-TOMM, volume = "20", number = "8", pages = "256:1--256:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664288", abstract = "Recent years have witnessed the successful application of knowledge graph techniques in structured data processing, while how to incorporate knowledge from visual and textual modalities into knowledge graphs has been given less attention. To better \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "256", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:MFG, author = "Li Zhou and Zhenyu Liu and Yutong Li and Yuchi Duan and Huimin Yu and Bin Hu", title = "Multi Fine-Grained Fusion Network for Depression Detection", journal = j-TOMM, volume = "20", number = "8", pages = "257:1--257:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665247", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665247", abstract = "Depression is an illness that involves emotional and mental health. Currently, depression detection through interviews is the most popular way. 
With the advancement of natural language processing and sentiment analysis, automated interview-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "257", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2024:CTI, author = "Chenlei Lv and Dan Zhang and Shengling Geng and Zhongke Wu and Hui Huang", title = "Color Transfer for Images: a Survey", journal = j-TOMM, volume = "20", number = "8", pages = "258:1--258:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635152", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635152", abstract = "High-quality image generation is an important topic in digital visualization. As a sub-topic of the research, color transfer is to produce a high-quality image with ideal color scheme learned from the reference one. In this article, we investigate the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "258", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:RAR, author = "Zhihao Zhang and Jun Wang and Shengjie Li and Lei Jin and Hao Wu and Jian Zhao and Bo Zhang", title = "Review and Analysis of {RGBT} Single Object Tracking Methods: a Fusion Perspective", journal = j-TOMM, volume = "20", number = "8", pages = "259:1--259:??", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3651308", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Aug 28 06:37:02 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3651308", abstract = "Visual tracking is a fundamental task in computer vision with significant practical applications in various domains, including surveillance, security, robotics, and human-computer interaction. However, it may face limitations in visible light data, such \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "259", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:CST, author = "Yuantong Zhang and Daiqin Yang and Zhenzhong Chen and Wenpeng Ding", title = "Continuous Space-Time Video Super-Resolution with Multi-Stage Motion Information Reorganization", journal = j-TOMM, volume = "20", number = "9", pages = "273:1--273:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665646", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665646", abstract = "Space-time video super-resolution (ST-VSR) aims to simultaneously expand a given source video to a higher frame rate and resolution. However, most existing schemes either consider fixed intermediate time and scale or fail to exploit long-range temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "273", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2024:DAT, author = "Caijuan Shi and Yuanfan Zheng and Zhen Chen", title = "Domain Adaptive Thermal Object Detection with Unbiased Granularity Alignment", journal = j-TOMM, volume = "20", number = "9", pages = "274:1--274:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665892", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665892", abstract = "Domain Adaptive Object Detection (DAOD) alleviates the challenge of labor-intensive annotations by transferring semantic information from a labeled source domain to an unlabeled target domain. However, the DAOD suffers from biased discrimination and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "274", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:AAS, author = "Ziyi Liu and You Yang and Kejun Wu and Qiong Liu and Xinghua Xu and Xiaoxuan Ma and Jiang Tang", title = "{ASIFusion}: an Adaptive Saliency Injection-Based Infrared and Visible Image Fusion Network", journal = j-TOMM, volume = "20", number = "9", pages = "275:1--275:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665893", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665893", abstract = "The purpose of infrared and visible image fusion (IVIF) is to acquire a more informative fused image by leveraging complementary information, facilitating human perception and machine vision. Among the existing fusion methods, the saliency-based methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "275", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:LAC, author = "Xu Wu and Zhihui Lai and Jie Zhou and Xianxu Hou and Witold Pedrycz and Linlin Shen", title = "Light-Aware Contrastive Learning for Low-Light Image Enhancement", journal = j-TOMM, volume = "20", number = "9", pages = "276:1--276:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665498", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665498", abstract = "Low-Light Image Enhancement (LLIE) presents challenges due to texture information loss and uneven illumination, which can distort feature distribution and reduce the quality of the enhanced images. However, current deep learning methods for LLIE only use \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "276", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bandung:2024:IVD, author = "Yoanes Bandung and Mokhamad Arfan Wicaksono and Sean Pribadi and Armein Z. R. 
Langi and Dion Tanjung", title = "{IoT} Video Delivery Optimization through Machine Learning-Based Frame Resolution Adjustment", journal = j-TOMM, volume = "20", number = "9", pages = "277:1--277:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665929", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665929", abstract = "Providing acceptable video quality in the Internet of Things (IoT) implementation poses a significant challenge, mainly when the application is performed on low-cost and low-power devices. This research focuses on developing a frame resolution adjustment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "277", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2024:ANR, author = "Jingwei Ma and Kangkang Bian and Yang Xu and Lei Zhu", title = "{ANAGL}: a Noise-Resistant and Anti-Sparse Graph Learning for Micro-Video Recommendation", journal = j-TOMM, volume = "20", number = "9", pages = "278:1--278:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3670407", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3670407", abstract = "In recent years, graph convolutional networks (GCNs) have seen widespread utilization within micro-video recommendation systems, facilitating the understanding of user preferences through interactions with micro-videos. Despite the commendable performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "278", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:VVB, author = "Wuyang Chen and Boqing Zhu and Kele Xu and Yong Dou and Dawei Feng", title = "{VoiceStyle}: Voice-Based Face Generation via Cross-Modal Prototype Contrastive Learning", journal = j-TOMM, volume = "20", number = "9", pages = "279:1--279:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3671002", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3671002", abstract = "Can we predict a person's appearance solely based on their voice? This article explores this question by focusing on generating a face from an unheard voice segment. Our proposed method, VoiceStyle, combines cross-modal representation learning with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "279", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cai:2024:TAC, author = "Chen Cai and Kim-Hui Yap and Suchen Wang", title = "Toward Attribute-Controlled Fashion Image Captioning", journal = j-TOMM, volume = "20", number = "9", pages = "280:1--280:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3671000", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3671000", abstract = "Fashion image captioning is a critical task in the fashion industry that aims to automatically generate product descriptions for fashion items. 
However, existing fashion image captioning models predict a fixed caption for a particular fashion item once \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "280", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lv:2024:SVI, author = "Kai Lv and Haobo Chen and Chuyang Zhao and Kai Tu and Junru Chen and Yadong Li and Boxun Li and Youfang Lin", title = "Style Variable and Irrelevant Learning for Generalizable Person Re-identification", journal = j-TOMM, volume = "20", number = "9", pages = "281:1--281:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3671003", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3671003", abstract = "Domain generalization person re-identification (DG-ReID) has gained much attention recently due to the poor performance of supervised re-identification on unseen domains. The goal of domain generalization is to develop a model that is insensitive to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "281", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:SSC, author = "Mengran Li and Ronghui Zhang and Yong Zhang and Xinglin Piao and Shiyu Zhao and Baocai Yin", title = "{SCAE}: Structural Contrastive Auto-Encoder for Incomplete Multi-View Representation Learning", journal = j-TOMM, volume = "20", number = "9", pages = "282:1--282:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672078", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672078", abstract = "Describing an object from multiple perspectives often leads to incomplete data representation. Consequently, learning consistent representations for missing data from multiple views has emerged as a key focus in the realm of Incomplete Multi-view \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "282", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:MDE, author = "Hanzhang Wang and Deming Zhai and Xiong Zhou and Junjun Jiang and Xianming Liu", title = "{Mix-DDPM}: Enhancing Diffusion Models through Fitting Mixture Noise with Global Stochastic Offset", journal = j-TOMM, volume = "20", number = "9", pages = "283:1--283:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672080", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672080", abstract = "Denoising diffusion probabilistic models (DDPM) have shown impressive performance in various domains as a class of deep generative models. In this article, we introduce the mixture noise-based DDPM (Mix-DDPM), which considers the Markov diffusion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "283", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hou:2024:TLF, author = "Wenxuan Hou and Guangyao Li and Yapeng Tian and Di Hu", title = "Toward Long Form Audio-Visual Video Understanding", journal = j-TOMM, volume = "20", number = "9", pages = "284:1--284:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672079", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672079", abstract = "We live in a world filled with never-ending streams of multimodal information. 
As a more natural recording of the real scenario, long form audio-visual videos (LFAVs) are expected as an important bridge for better exploring and understanding the world. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "284", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2024:MIQ, author = "Encheng Yu and Jianer Zhou and Zhenyu Li and Gareth Tyson and Weichao Li and Xinyi Zhang and Zhiwei Xu and Gaogang Xie", title = "{Mustang}: Improving {QoE} for Real-Time Video in Cellular Networks by Masking Jitter", journal = j-TOMM, volume = "20", number = "9", pages = "285:1--285:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672399", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672399", abstract = "The advent of 5G and interactive live broadcasting has led to a growing trend of people preferring real-time interactive video services on mobile devices, particularly mobile phones. In this work, we measure the performance of Google congestion control, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "285", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:MPC, author = "Yan Li and Xiangyuan Lan and Haifeng Chen and Ke Lu and Dongmei Jiang", title = "Multimodal {PEAR} Chain-of-Thought Reasoning for Multimodal Sentiment Analysis", journal = j-TOMM, volume = "20", number = "9", pages = "286:1--286:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672398", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672398", abstract = "Multimodal sentiment analysis aims to predict sentiments from multimodal signals such as audio, video, and text. Existing methods often rely on Pre-trained Language Models (PLMs) to extract semantic information from textual data, lacking an in-depth \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "286", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:BSS, author = "Zechen Liang and Yuan-Gen Wang and Wei Lu and Xiaochun Cao", title = "Boosting Semi-Supervised Learning with Dual-Threshold Screening and Similarity Learning", journal = j-TOMM, volume = "20", number = "9", pages = "287:1--287:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672563", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672563", abstract = "How to effectively utilize unlabeled data for training is a key problem in Semi-Supervised Learning (SSL). 
Existing SSL methods often consider the unlabeled data whose predictions are beyond a fixed threshold (e.g., 0.95) and discard those less than 0.95. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "287", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:SSE, author = "Chen Chen and Lingfeng Qu and Hadi Amirpour and Xingjun Wang and Christian Timmerer and Zhihong Tian", title = "On the Security of Selectively Encrypted {HEVC} Video Bitstreams", journal = j-TOMM, volume = "20", number = "9", pages = "288:1--288:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672568", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672568", abstract = "With the growing applications of video, ensuring its security has become of utmost importance. Selective encryption (SE) has gained significant attention in the field of video content protection due to its compatibility with video codecs, favorable visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "288", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qin:2024:MGP, author = "Tai Qin and Ge Li and Wei Gao and Shan Liu", title = "Multi-Grained Point Cloud Geometry Compression via Dual-Model Prediction with Extended Octree", journal = j-TOMM, volume = "20", number = "9", pages = "289:1--289:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3671001", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3671001", abstract = "The state-of-the-art geometry-based point cloud compression (G-PCC) (Octree) is the fine-grained approach, which uses the octree to partition point clouds into voxels and predicts them based on neighbor occupancy in narrower spaces. However, G-PCC (Octree). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "289", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:LDI, author = "Jiehua Zhang and Liang Li and Chenggang Yan and Zhan Wang and Changliang Xu and Jiyong Zhang and Chuqiao Chen", title = "Learning Domain Invariant Features for Unsupervised Indoor Depth Estimation Adaptation", journal = j-TOMM, volume = "20", number = "9", pages = "290:1--290:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672397", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672397", abstract = "Predicting depth maps from monocular images has made an impressive performance in the past years. 
However, most depth estimation methods are trained with paired image-depth map data or multi-view images (e.g., stereo pair and monocular sequence), which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "290", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2024:CPC, author = "Yiling Xu and Yujie Zhang and Qi Yang and Xiaozhong Xu and Shan Liu", title = "Compressed Point Cloud Quality Index by Combining Global Appearance and Local Details", journal = j-TOMM, volume = "20", number = "9", pages = "291:1--291:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672567", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672567", abstract = "In recent years, many standardized algorithms for point cloud compression (PCC) has been developed and achieved remarkable compression ratios. To provide guidance for rate-distortion optimization and codec evaluation, point cloud quality assessment (PCQA) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "291", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:MFT, author = "Zhilei Liu and Xiaoxing Liu and Sen Chen and Jiaxing Liu and Longbiao Wang and Chongke Bi", title = "Multimodal Fusion for Talking Face Generation Utilizing Speech-Related Facial Action Units", journal = j-TOMM, volume = "20", number = "9", pages = "292:1--292:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672565", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672565", abstract = "Talking face generation is to synthesize a lip-synchronized talking face video by inputting an arbitrary face image and corresponding audio clips. The current talking face model can be divided into four parts: visual feature extraction, audio feature \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "292", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:KVK, author = "Zizhao Wu and Siyu Liu and Peioyan Lu and Ping Yang and Yongkang Wong and Xiaoling Gu and Mohan S. Kankanhalli", title = "{KF-VTON}: Keypoints-Driven Flow Based Virtual Try-On Network", journal = j-TOMM, volume = "20", number = "9", pages = "293:1--293:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673903", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3673903", abstract = "Image-based virtual try-on aims to fit a target garment to a reference person. 
Most existing methods are limited to solving the Garment-To-Person (G2P) try-on task that transfers a garment from a clean product image to the reference person and do not \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "293", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhuo:2024:UVE, author = "Linhai Zhuo and Yuqian Fu and Jingjing Chen and Yixin Cao and Yu-Gang Jiang", title = "Unified View Empirical Study for Large Pretrained Model on Cross-Domain Few-Shot Learning", journal = j-TOMM, volume = "20", number = "9", pages = "294:1--294:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673231", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3673231", abstract = "The challenge of cross-domain few-shot learning (CD-FSL) stems from the substantial distribution disparities between target and source domain images, necessitating a model with robust generalization capabilities. In this work, we posit that large-scale \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "294", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zuo:2024:PEP, author = "Ruifan Zuo and Chaoqun Zheng and Fengling Li and Lei Zhu and Zheng Zhang", title = "Privacy-Enhanced Prototype-Based Federated Cross-Modal Hashing for Cross-Modal Retrieval", journal = j-TOMM, volume = "20", number = "9", pages = "295:1--295:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674507", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674507", abstract = "Cross-modal hashing is widely used for efficient similarity searches, improving data processing efficiency, and reducing storage costs. Existing cross-modal hashing methods primarily focus on centralized training scenarios, where fixed-scale and fixed-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "295", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2024:TDV, author = "Xue Song and Jingjing Chen and Bin Zhu and Yu-Gang Jiang", title = "Text-Driven Video Prediction", journal = j-TOMM, volume = "20", number = "9", pages = "296:1--296:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3675171", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Sep 24 06:42:16 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3675171", abstract = "Current video generation models usually convert signals indicating appearance and motion received from inputs (e.g., image and text) or latent spaces (e.g., noise vectors) into consecutive frames, fulfilling a stochastic generation process for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "296", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hussain:2024:SYA, author = "Walayat Hussain and Honghao Gao and Rafiul Karim and Abdulmotaleb El Saddik", title = "Seventeen Years of the {{\booktitle{ACM Transactions on Multimedia Computing, Communications and Applications}}}: a Bibliometric Overview", journal = j-TOMM, volume = "20", number = "10", pages = "297:1--297:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3660347", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:43:12 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3660347", abstract = "ACM Transactions on Multimedia Computing, Communications, and Applications has been dedicated to advancing multimedia research, fostering discoveries, innovations, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "297", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2024:UFL, author = "Bowen Yuan and Jiahao Lu and Sisi You and Bing-Kun Bao", title = "Unbiased Feature Learning with Causal Intervention for Visible-Infrared Person Re-Identification", journal = j-TOMM, volume = "20", number = "10", pages = "298:1--298:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674737", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674737", abstract = "Visible-infrared person re-identification (VI-ReID) aims to match individuals across different modalities. 
Existing methods can learn class-separable features but still struggle with modality gaps within class due to the modality-specific information, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "298", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chan:2024:AFF, author = "Sixian Chan and Xianpeng Zeng and Xinhua Wang and Jie Hu and Cong Bai", title = "Auxiliary Feature Fusion and Noise Suppression for {HOI} Detection", journal = j-TOMM, volume = "20", number = "10", pages = "299:1--299:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674980", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674980", abstract = "In recent years, one-stage HOI (Human-Object Interaction) detection methods tend to divide the original task into multiple sub-tasks by using a multi-branch network structure. However, there is no sufficient attention to information communication between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "299", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:GMM, author = "Yefan Li and Fuqing Duan and Ke Lu", title = "Gated Multi-Modal Edge Refinement Network for Light Field Salient Object Detection", journal = j-TOMM, volume = "20", number = "10", pages = "300:1--300:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674836", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674836", abstract = "Light field can be decoded into multiple representations and provides valuable focus and depth information. This breakthrough overcomes the limitations of traditional 2D and 3D saliency detection methods, opening up new possibilities for more accurate and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "300", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hao:2024:HHC, author = "Dongze Hao and Qunbo Wang and Xinxin Zhu and Jing Liu", title = "{HCCL}: Hierarchical Counterfactual Contrastive Learning for Robust Visual Question Answering", journal = j-TOMM, volume = "20", number = "10", pages = "301:1--301:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673902", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3673902", abstract = "Despite most state-of-the-art models having achieved amazing performance in Visual Question Answering (VQA), they usually utilize biases to answer the question. 
Recently, some studies synthesize counterfactual training samples to help the model to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "301", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jia:2024:HBS, author = "Jun Jia and Zhongpai Gao and Yiwei Yang and Wei Sun and Dandan Zhu and Xiaohong Liu and Xiongkuo Min and Guangtao Zhai", title = "Hidden Barcode in Sub-Images with Invisible Locating Marker", journal = j-TOMM, volume = "20", number = "10", pages = "302:1--302:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674976", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674976", abstract = "The prevalence of the Internet of Things has led to the widespread adoption of 2D barcodes as a means of offline-to-online communication. Whereas, 2D barcodes are not ideal for publicity materials due to their space-consuming nature. Recent works have \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "302", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lu:2024:MDE, author = "Junxin Lu and Yongbin Gao and Jieyu Chen and Jeng-Neng Hwang and Hamido Fujita and Zhijun Fang", title = "Monocular Depth and Ego-motion Estimation with Scale Based on Superpixel and Normal Constraints", journal = j-TOMM, volume = "20", number = "10", pages = "303:1--303:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674977", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674977", abstract = "Three-dimensional perception in intelligent virtual and augmented reality (VR/AR) and autonomous vehicles (AV) applications is critical and attracting significant attention. The self-supervised monocular depth and ego-motion estimation serves as a more \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "303", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2024:DYC, author = "Zhenjiang Guo and Xiaohai He and Yu Yang and Linbo Qing and Honggang Chen", title = "{DAG-YOLO}: a Context-Feature Adaptive Fusion Rotating Detection Network in Remote Sensing Images", journal = j-TOMM, volume = "20", number = "10", pages = "304:1--304:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674978", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674978", abstract = "Object detection in remote sensing image (RSI) research has seen significant advancements, particularly with the advent of deep learning. However, challenges such as orientation, scale, aspect ratio variations, dense object distribution, and category \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "304", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:MML, author = "Yong Zhou and Zeming Xie and Jiaqi Zhao and Wenliang Du and Rui Yao and Abdulmotaleb {El Saddik}", title = "Multi-Modal {LiDAR} Point Cloud Semantic Segmentation with Salience Refinement and Boundary Perception", journal = j-TOMM, volume = "20", number = "10", pages = "305:1--305:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674979", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674979", abstract = "Point cloud segmentation is essential for scene understanding, which provides advanced information for many applications, such as autonomous driving, robots, and virtual reality. To improve the accuracy and robustness of point cloud segmentation, many \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "305", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:HRS, author = "Yuanyuan Wang and Meng Liu and Xuemeng Song and Liqiang Nie", title = "Harnessing Representative Spatial-Temporal Information for Video Question Answering", journal = j-TOMM, volume = "20", number = "10", pages = "306:1--306:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3675399", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3675399", abstract = "Video question answering, aiming to answer a natural language question related to the given video, has become prevalent in the past few years. Although remarkable improvements have been obtained, it is still exposed to the challenge of insufficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "306", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liao:2024:RFM, author = "Guibiao Liao and Wei Gao", title = "Rethinking Feature Mining for Light Field Salient Object Detection", journal = j-TOMM, volume = "20", number = "10", pages = "307:1--307:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3676967", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3676967", abstract = "Light field salient object detection (LF SOD) has recently received increasing attention. However, most current works typically rely on an individual focal stack backbone for feature extraction. 
This manner ignores the characteristic of blurred saliency \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "307", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:NTH, author = "Chao Liang and Linchao Zhu and Zongxin Yang and Wei Chen and Yi Yang", title = "Noise-Tolerant Hybrid Prototypical Learning with Noisy {Web} Data", journal = j-TOMM, volume = "20", number = "10", pages = "308:1--308:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672396", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672396", abstract = "We focus on the challenging problem of learning an unbiased classifier from a large number of potentially relevant but noisily labeled web images given only a few clean labeled images. This problem is particularly practical because it reduces the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "308", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2024:DDL, author = "Yitao Peng and Lianghua He and Die Hu and Yihang Liu and Longzhen Yang and Shaohua Shang", title = "Decoupling Deep Learning for Enhanced Image Recognition Interpretability", journal = j-TOMM, volume = "20", number = "10", pages = "309:1--309:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674837", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3674837", abstract = "The quest for enhancing the interpretability of neural networks has become a prominent focus in recent research endeavors. Prototype-based neural networks have emerged as a promising avenue for imbuing models with interpretability by gauging the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "309", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2024:DDC, author = "Baoli Sun and Yanjun Guo and Tiantian Yan and Xinchen Ye and Zhihui Wang and Haojie Li and Zhiyong Wang", title = "Digging into Depth and Color Spaces: a Mapping Constraint Network for Depth Super-Resolution", journal = j-TOMM, volume = "20", number = "10", pages = "310:1--310:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3677123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3677123", abstract = "Scene depth super-resolution (DSR) poses an inherently ill-posed problem due to the extremely large space of one-to-many mapping functions from a given low-resolution (LR) depth map, which possesses limited depth information, to multiple plausible high \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "310", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Seufert:2024:COC, author = "Michael Seufert and Marius Spangenberger and Fabian Poign{\'e}e and Florian Wamser and Werner Robitza and Christian Timmerer and Tobias Ho{\ss}feld", title = "{COBIRAS}: Offering a Continuous Bit Rate Slide to Maximize {DASH} Streaming Bandwidth Utilization", journal = j-TOMM, volume = "20", number = "10", pages = "311:1--311:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3677379", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3677379", abstract = "Reaching close-to-optimal bandwidth utilization in dynamic adaptive streaming over HTTP (DASH) systems can, in theory, be achieved with a small discrete set of bit rate representations. This includes typical bit rate ladders used in state-of-the-art DASH \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "311", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2024:MLF, author = "Zhangyong Tang and Tianyang Xu and Xiao-Jun Wu and Josef Kittler", title = "Multi-Level Fusion for Robust {RGBT} Tracking via Enhanced Thermal Representation", journal = j-TOMM, volume = "20", number = "10", pages = "312:1--312:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678176", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678176", abstract = "Due to the limitations of visible (RGB) sensors in challenging scenarios, such as nighttime and foggy environments, the thermal infrared (TIR) modality draws increasing attention as an auxiliary source for robust tracking systems. Currently, the existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "312", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tu:2024:RFI, author = "Hanyue Tu and Li Li and Wengang Zhou and Houqiang Li", title = "Reconstruction-Free Image Compression for Machine Vision via Knowledge Transfer", journal = j-TOMM, volume = "20", number = "10", pages = "313:1--313:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678471", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678471", abstract = "Reconstruction-free image compression for machine vision aims to perform machine vision tasks directly on compressed-domain representations instead of reconstructed images. Existing reports have validated the feasibility of compressed-domain machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "313", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:USD, author = "Gai Zhang and Xinfeng Zhang and Lv Tang", title = "Unified and Scalable Deep Image Compression Framework for Human and Machine", journal = j-TOMM, volume = "20", number = "10", pages = "314:1--314:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678472", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678472", abstract = "Image compression aims to minimize the amount of data in image representation while maintaining a certain visual quality for humans, which is an essential technique for storage and transmission. 
Recently, along with the development of computer vision, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "314", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:LCA, author = "Fengyong Li and Huajun Zhai and Teng Liu and Xinpeng Zhang and Chuan Qin", title = "Learning Compressed Artifact for {JPEG} Manipulation Localization Using Wide-Receptive-Field Network", journal = j-TOMM, volume = "20", number = "10", pages = "315:1--315:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678883", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678883", abstract = "JPEG image manipulation localization aims to accurately classify and locate tampered regions in JPEG images. Existing image manipulation localization schemes usually consider diverse data streams of spatial domain, e.g. noise inconsistency and local \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "315", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yin:2024:EIL, author = "Shukang Yin and Sirui Zhao and Hao Wang and Tong Xu and Enhong Chen", title = "Exploiting Instance-level Relationships in Weakly Supervised Text-to-Video Retrieval", journal = j-TOMM, volume = "20", number = "10", pages = "316:1--316:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663571", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663571", abstract = "Text-to-Video Retrieval is a typical cross-modal retrieval task that has been studied extensively under a conventional supervised setting. Recently, some works have sought to extend the problem to a weakly supervised formulation, which can be more \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "316", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Latifzadeh:2024:EDA, author = "Kayhan Latifzadeh and Nima Gozalpour and V. Javier Traver and Tuukka Ruotsalo and Aleksandra Kawala-Sterniuk and Luis A. 
Leiva", title = "Efficient Decoding of Affective States from Video-elicited {EEG} Signals: an Empirical Investigation", journal = j-TOMM, volume = "20", number = "10", pages = "317:1--317:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663669", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663669", abstract = "Affect decoding through brain-computer interfacing (BCI) holds great potential to capture users' feelings and emotional responses via non-invasive electroencephalogram (EEG) sensing. Yet, little research has been conducted to understand efficient decoding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "317", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:LCA, author = "Ziyue Wu and Junyu Gao and Shucheng Huang and Changsheng Xu", title = "Learning Commonsense-aware Moment-Text Alignment for Fast Video Temporal Grounding", journal = j-TOMM, volume = "20", number = "10", pages = "318:1--318:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663368", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3663368", abstract = "Grounding temporal video segments described in natural language queries effectively and efficiently is a crucial capability needed in vision-and-language fields. In this article, we deal with the fast video temporal grounding (FVTG) task, aiming at \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "318", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lorenzi:2024:MDC, author = "Daniele Lorenzi and Farzad Tashtarian and Hermann Hellwagner and Christian Timmerer", title = "{MEDUSA}: a Dynamic Codec Switching Approach in {HTTP} Adaptive Streaming", journal = j-TOMM, volume = "20", number = "10", pages = "319:1--319:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3656175", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3656175", abstract = "HTTP Adaptive Streaming (HAS) solutions utilize various Adaptive BitRate (ABR) algorithms to dynamically select appropriate video representations, aiming at adapting to fluctuations in network bandwidth. However, current ABR implementations have a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "319", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pi:2024:EVA, author = "Ruoyan Pi and Peng Wu and Xiangteng He and Yuxin Peng", title = "{EOGT}: Video Anomaly Detection with Enhanced Object Information and Global Temporal Dependency", journal = j-TOMM, volume = "20", number = "10", pages = "320:1--320:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3662185", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3662185", abstract = "Video anomaly detection (VAD) aims to identify events or scenes in videos that deviate from typical patterns. 
Existing approaches primarily focus on reconstructing or predicting frames to detect anomalies and have shown improved performance in recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "320", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yue:2024:MGR, author = "Shengbin Yue and Yunbin Tu and Liang Li and Shengxiang Gao and Zhengtao Yu", title = "Multi-Grained Representation Aggregating Transformer with Gating Cycle for Change Captioning", journal = j-TOMM, volume = "20", number = "10", pages = "321:1--321:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3660346", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3660346", abstract = "Change captioning aims to describe the difference within an image pair in natural language, which combines visual comprehension and language generation. Although significant progress has been achieved, it remains a key challenge of perceiving the object \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "321", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:ADS, author = "Jingjing Wu and Xi Zhou and Xiaohong Li and Hao Liu and Meibin Qi and Richang Hong", title = "Asymmetric Deformable Spatio-temporal Framework for Infrared Object Tracking", journal = j-TOMM, volume = "20", number = "10", pages = "322:1--322:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678882", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678882", abstract = "The Infrared Object Tracking (IOT) task aims to locate objects in infrared sequences. Since color and texture information is unavailable in infrared modality, most existing infrared trackers merely rely on capturing spatial contexts from the image to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "322", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:DPP, author = "Zhenyu Li and Shanshan Gao and Deqian Mao and Shouwen Song and Lei Li and Yuanfeng Zhou", title = "Deep Plug-and-Play Non-Iterative Cluster for {$3$D} Global Feature Extraction", journal = j-TOMM, volume = "20", number = "10", pages = "323:1--323:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3679204", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3679204", abstract = "Efficient and accurate point cloud feature extraction is crucial for critical tasks such as 3D recognition and semantic segmentation. However, existing global feature extraction methods for 3D data often require designing different models for different \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "323", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xue:2024:SAA, author = "Mingfu Xue and Yinghao Wu and Leo Yu Zhang and Dujuan Gu and Yushu Zhang and Weiqiang Liu", title = "{SSAT}: Active Authorization Control and {User}'s Fingerprint Tracking Framework for {DNN IP} Protection", journal = j-TOMM, volume = "20", number = "10", pages = "324:1--324:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3679202", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3679202", abstract = "As training a high-performance deep neural network (DNN) model requires a large amount of data, powerful computing resources and expert knowledge, protecting well-trained DNN models from intellectual property (IP) infringement has raised serious concerns \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "324", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:FSF, author = "Yongkang Li and Qifan Liang and Zhen Han and Wenjun Mai and Zhongyuan Wang", title = "Few-Shot Face Sketch-to-Photo Synthesis via Global-Local Asymmetric Image-to-Image Translation", journal = j-TOMM, volume = "20", number = "10", pages = "325:1--325:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672400", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672400", abstract = "Face sketch-to-photo synthesis is widely used in law enforcement and digital entertainment, which can be achieved by Image-to-Image (I2I) translation. Traditional I2I translation algorithms usually regard the bidirectional translation of two image domains \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "325", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:AAL, author = "Shuqin Chen and Xian Zhong and Yi Zhang and Lei Zhu and Ping Li and Xiaokang Yang and Bin Sheng", title = "Action-aware Linguistic Skeleton Optimization Network for Non-autoregressive Video Captioning", journal = j-TOMM, volume = "20", number = "10", pages = "326:1--326:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3679203", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3679203", abstract = "Non-autoregressive video captioning methods generate visual words in parallel but often overlook semantic correlations among them, especially regarding verbs, leading to lower caption quality. To address this, we integrate action information of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "326", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2024:LFR, author = "Yancun Yang and Weiqing Min and Jingru Song and Guorui Sheng and Lili Wang and Shuqiang Jiang", title = "Lightweight Food Recognition via Aggregation Block and Feature Encoding", journal = j-TOMM, volume = "20", number = "10", pages = "327:1--327:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680285", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3680285", abstract = "Food image recognition has recently been given considerable attention in the multimedia field in light of its possible implications on health. The characteristics of the dispersed distribution of ingredients in food images put forward higher requirements \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "327", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:MMR, author = "Huaijin Liu and Jixiang Du and Yong Zhang and Hongbo Zhang and Jiandian Zeng", title = "{MSSA}: Multi-Representation Semantics-Augmented Set Abstraction for {$3$D} Object Detection", journal = j-TOMM, volume = "20", number = "10", pages = "328:1--328:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3686157", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3686157", abstract = "Accurate recognition and localization of 3D objects is a fundamental research problem in 3D computer vision. Benefiting from transformation-free point cloud processing and flexible receptive fields, point-based methods have become accurate in 3D point \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "328", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kawai:2024:RBH, author = "Vinicius Sato Kawai and Lucas Pascotti Valem and Alexandro Baldassin and Edson Borin and Daniel Carlos Guimar{\~a}es Pedronette and Longin Jan Latecki", title = "Rank-based Hashing for Effective and Efficient Nearest Neighbor Search for Image Retrieval", journal = j-TOMM, volume = "20", number = "10", pages = "329:1--329:??", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659580", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 31 10:45:31 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3659580", abstract = "The large and growing amount of digital data creates a pressing need for approaches capable of indexing and retrieving multimedia content. A traditional and fundamental challenge consists of effectively and efficiently performing nearest-neighbor searches. After decades of research, several different methods are available, including trees, hashing, and graph-based approaches. Most of the current methods exploit learning to hash approaches based on deep learning. In spite of effective results and compact codes obtained, such methods often require a significant amount of labeled data for training. Unsupervised approaches also rely on expensive training procedures usually based on a huge amount of data. In this work, we propose an unsupervised data-independent approach for nearest neighbor searches, which can be used with different features, including deep features trained by transfer learning. The method uses a rank-based formulation and exploits a hashing approach for efficient ranked list computation at query time. 
A comprehensive experimental evaluation was conducted on seven public datasets, considering deep features based on CNNs and Transformers. Both effectiveness and efficiency aspects were evaluated. The proposed approach achieves remarkable results in comparison to traditional and state-of-the-art methods. Hence, it is an attractive and innovative solution, especially when costly training procedures need to be avoided.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "329", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Vyas:2024:ISI, author = "Ritesh Vyas and Michele Nappi and Alberto {Del Bimbo} and Sambit Bakshi", title = "Introduction to Special Issue on {``Recent Trends in Multimedia Forensics''}", journal = j-TOMM, volume = "20", number = "11", pages = "330:1--330:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678473", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3678473", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "330", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Carletti:2024:FSB, author = "Vincenzo Carletti and Pasquale Foggia and Antonio Greco and Alessia Saggese and Mario Vento", title = "Facial Soft-biometrics Obfuscation through Adversarial Attacks", journal = j-TOMM, volume = "20", number = "11", pages = "331:1--331:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3656474", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3656474", abstract = "Sharing facial pictures through online services, especially on social networks, has become a common habit for thousands of users. This practice hides a possible threat to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "331", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:MTA, author = "Hanrui Wang and Shuo Wang and Cunjian Chen and Massimo Tistarelli and Zhe Jin", title = "A Multi-Task Adversarial Attack against Face Authentication", journal = j-TOMM, volume = "20", number = "11", pages = "332:1--332:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665496", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665496", abstract = "Deep learning-based identity management systems, such as face authentication systems, are vulnerable to adversarial attacks. 
However, existing attacks are typically designed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "332", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2024:SMG, author = "Tian Wu and Rongbo Zhu and Shaohua Wan", title = "Semantic Map Guided Identity Transfer {GAN} for Person Re-identification", journal = j-TOMM, volume = "20", number = "11", pages = "333:1--333:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631355", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3631355", abstract = "Generative adversarial networks (GANs)-based person re-identification (re-id) schemes provide potential ways to augment data in practical applications. However, existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "333", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mahto:2024:RCP, author = "D. K. Mahto and A. K. Singh and K. N. Singh and O. P. Singh and A. K. Agrawal", title = "Robust Copyright Protection Technique with High-embedding Capacity for Color Images", journal = j-TOMM, volume = "20", number = "11", pages = "334:1--334:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3580502", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3580502", abstract = "Copyright violation issues have a growing impact on applications of the digital era, especially images. 
It is not easy to guarantee the copyright protection of essential information. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "334", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{S:2024:ISM, author = "Shitharth S. and Hariprasath Manoharan and Alaa O. Khadidos and Achyut Shankar and Carsten Maple and Adil O. Khadidos and Shahid Mumtaz", title = "Improved Security for Multimedia Data Visualization using Hierarchical Clustering Algorithm", journal = j-TOMM, volume = "20", number = "11", pages = "335:1--335:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3610296", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3610296", abstract = "In this paper, a realization technique is designed with a unique analytical model for transmitting multimedia data to appropriate end users. Transmission of multimedia data to all \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "335", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2024:GIS, author = "Youqiang Sun and Jianyi Liu and Ru Zhang", title = "Generative Image Steganography Based on Guidance Feature Distribution", journal = j-TOMM, volume = "20", number = "11", pages = "336:1--336:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625297", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625297", abstract = "Without modification, generative steganography is more secure than modification-based steganography. However, existing generative steganography methods still have \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "336", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Neekhara:2024:FSF, author = "Paarth Neekhara and Shehzeen Hussain and Xinqiao Zhang and Ke Huang and Julian McAuley and Farinaz Koushanfar", title = "{FaceSigns}: Semi-fragile Watermarks for Media Authentication", journal = j-TOMM, volume = "20", number = "11", pages = "337:1--337:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640466", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3640466", abstract = "Manipulated media is becoming a prominent threat due to the recent advances in realistic image and video synthesis techniques. There have been several attempts at \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "337", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2024:BTS, author = "Jing Zhao and Hongwei Yang and Hui He and Jie Peng and Weizhe Zhang and Jiangqun Ni and Arun Kumar Sangaiah and Aniello Castiglione", title = "Backdoor Two-Stream Video Models on Federated Learning", journal = j-TOMM, volume = "20", number = "11", pages = "338:1--338:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3651307", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3651307", abstract = "Video models on federated learning (FL) enable continual learning of the involved models for video tasks on end-user devices while protecting the privacy of end-user data. As a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "338", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Iqbal:2024:DAB, author = "Farkhund Iqbal and Ahmed Abbasi and Abdul Rehman Javed and Ahmad Almadhor and Zunera Jalil and Sajid Anwar and Imad Rida", title = "Data Augmentation-based Novel Deep Learning Method for Deepfaked Images Detection", journal = j-TOMM, volume = "20", number = "11", pages = "339:1--339:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3592615", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3592615", abstract = "Recent advances in artificial intelligence have led to deepfake images, enabling users to replace a real face with a genuine one. Deepfake images have recently been used to malign \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "339", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2024:DDV, author = "Kaihan Lin and Weihong Han and Shudong Li and Zhaoquan Gu and Huimin Zhao and Yangyang Mei", title = "Detecting Deepfake Videos using Spatiotemporal {Trident} Network", journal = j-TOMM, volume = "20", number = "11", pages = "340:1--340:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3623639", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3623639", abstract = "The widespread dissemination of Deepfake in social networks has posed serious security risks, thus necessitating the development of an effective Deepfake detection \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "340", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{UlHaq:2024:MNA, author = "Ijaz {Ul Haq} and Khalid Mahmood Malik and Khan Muhammad", title = "Multimodal Neurosymbolic Approach for Explainable Deepfake Detection", journal = j-TOMM, volume = "20", number = "11", pages = "341:1--341:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3624748", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3624748", abstract = "Deepfake detection has become increasingly important in recent years owing to the widespread availability of deepfake generation technologies. Existing deepfake \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "341", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Becattini:2024:HPE, author = "Federico Becattini and Carmen Bisogni and Vincenzo Loia and Chiara Pero and Fei Hao", title = "Head Pose Estimation Patterns as Deepfake Detectors", journal = j-TOMM, volume = "20", number = "11", pages = "342:1--342:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3612928", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3612928", abstract = "The capacity to create ``fake'' videos has recently raised concerns about the reliability of multimedia content. Identifying between true and false information is a critical step \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "342", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guarnera:2024:MDD, author = "Luca Guarnera and Oliver Giudice and Sebastiano Battiato", title = "Mastering Deepfake Detection: a Cutting-edge Approach to Distinguish {GAN} and Diffusion-model Images", journal = j-TOMM, volume = "20", number = "11", pages = "343:1--343:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652027", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3652027", abstract = "Detecting and recognizing deepfakes is a pressing issue in the digital age. 
In this study, we first collected a dataset of pristine images and fake ones properly generated by nine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "343", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nadimpalli:2024:PDD, author = "Aakash Varma Nadimpalli and Ajita Rattani", title = "{ProActive DeepFake} Detection using {GAN}-based Visible Watermarking", journal = j-TOMM, volume = "20", number = "11", pages = "344:1--344:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3625547", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3625547", abstract = "With the advances in generative adversarial networks (GAN), facial manipulations called DeepFakes have caused major security risks and raised severe societal concerns. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "344", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kaddar:2024:DDU, author = "Bachir Kaddar and Sid Ahmed Fezza and Zahid Akhtar and Wassim Hamidouche and Abdenour Hadid and Joan Serra-Sagrist{\'a}", title = "Deepfake Detection Using Spatiotemporal Transformer", journal = j-TOMM, volume = "20", number = "11", pages = "345:1--345:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3643030", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3643030", abstract = "Recent advances in generative models and the availability of large-scale benchmarks have made deepfake video generation and manipulation easier. Nowadays, the number of new \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "345", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2024:FDW, author = "Shuai Xiao and Zhuo Zhang and Jiachen Yang and Jiabao Wen and Yang Li", title = "Forgery Detection by Weighted Complementarity between Significant Invariance and Detail Enhancement", journal = j-TOMM, volume = "20", number = "11", pages = "346:1--346:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3605893", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3605893", abstract = "Generative adversarial networks have shown impressive results in the modeling of movies and games, but what if such powerful image generation capability is used to harm \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "346", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Capasso:2024:CSM, author = "Paola Capasso and Giuseppe Cattaneo and Maria {De Marsico}", title = "A Comprehensive Survey on Methods for Image Integrity", journal = j-TOMM, volume = "20", number = "11", pages = "347:1--347:??", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633203", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:35 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3633203", abstract = "The outbreak of digital devices on the Internet, the exponential diffusion of data (images, video, audio, and text), along with their manipulation/generation also by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "347", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:HPV, author = "Hongbin Wang and Rui Tang and Fan Li", title = "Hypercube Pooling for Visual Semantic Embedding", journal = j-TOMM, volume = "20", number = "12", pages = "363:1--363:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689637", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3689637", abstract = "Visual Semantic Embedding (VSE) is a primary model for cross-modal retrieval, wherein the global feature aggregator is a crucial component of the VSE model. In recent research, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "363", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:CLK, author = "Fei Wang and Liang Ding and Jun Rao and Ye Liu and Li Shen and Changxing Ding", title = "Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language Pretraining?", journal = j-TOMM, volume = "20", number = "12", pages = "364:1--364:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690640", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3690640", abstract = "The field of multimedia research has witnessed significant interest in leveraging multimodal pretrained neural network models to perceive and represent the physical world. Among \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "364", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:DDE, author = "Caixia Liu and Yali Chen and Minhong Zhu and Chenhui Hao and Haisheng Li and Xiaochuan Wang", title = "{DEGAN}: Detail-Enhanced Generative Adversarial Network for Monocular Depth-Based {$3$D} Reconstruction", journal = j-TOMM, volume = "20", number = "12", pages = "365:1--365:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690826", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3690826", abstract = "Although deep networks-based 3D reconstruction methods can recover the 3D geometry given few inputs, they may produce unfaithful reconstruction when \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "365", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2024:CMC, author = "Dan Song and Shumeng Huo and Xinwei Fu and Chumeng Zhang and Wenhui Li and An-An Liu", title = "Cross-Modal Contrastive Learning with a Style-Mixed Bridge for Single Image {$3$D} Shape Retrieval", journal = j-TOMM, volume = "20", number = "12", pages = "366:1--366:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689645", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3689645", abstract = "Image-based 3D shape retrieval (IBSR) is a cross-modal matching task which searches similar shapes from a 3D repository using a natural image. Continuous attention has been \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "366", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2024:UAV, author = "Ting-Lan Lin and Bing-Wei Su and Po-Cheng Shen and Ding-Yuan Chen and Chi-Fu Liang and Yan-Cheng Chen and Yangming Wen and Mohammad Shahid", title = "Upsampling Algorithm for {V-PCC-Coded} {$3$D} Point Clouds", journal = j-TOMM, volume = "20", number = "12", pages = "367:1--367:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690641", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3690641", abstract = "Point cloud (PC) compression is crucial to immersive visual applications such as autonomous vehicles to classify objects on the roads. 
The Motion Picture Experts \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "367", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2024:ETC, author = "Yuanzhi Wang and Yong Li and Xiaoya Zhang and Xin Liu and Anbo Dai and Antoni B. Chan and Zhen Cui", title = "Edit Temporal-Consistent Videos with Image Diffusion Model", journal = j-TOMM, volume = "20", number = "12", pages = "368:1--368:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3691344", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3691344", abstract = "Large-scale text-to-image (T2I) diffusion models have been extended for text-guided video editing, yielding impressive zero-shot video editing performance. Nonetheless, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "368", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alvarez:2024:GES, author = "Luis Alvarez and Agust{\'\i}n Trujillo and Nelson Monz{\'o}n and Jean-Michel Morel", title = "Generation and Editing of {$2$D} Shapes Using a Branched Representation", journal = j-TOMM, volume = "20", number = "12", pages = "369:1--369:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3691635", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3691635", abstract = "In this article, we propose a new planar shape representation, the medial branch graph representation (MBGR) which allows to easily generate, vary and edit all-new sorts \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "369", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:FUC, author = "Xingbo Liu and Jiamin Li and Xiushan Nie and Xuening Zhang and Yilong Yin", title = "Fast Unsupervised Cross-Modal Hashing with Robust Factorization and Dual Projection", journal = j-TOMM, volume = "20", number = "12", pages = "370:1--370:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694684", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694684", abstract = "Unsupervised hashing has attracted extensive attention in effectively and efficiently tackling large-scale cross-modal retrieval task. 
Existing methods typically try to mine the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "370", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2024:RWS, author = "Yongheng Zhang and Yuanqiang Cai and Danfeng Yan and Rongheng Lin", title = "Real-World Scene Image Enhancement with Contrastive Domain Adaptation Learning", journal = j-TOMM, volume = "20", number = "12", pages = "371:1--371:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694973", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694973", abstract = "Image enhancement methods leveraging learning-based approaches have demonstrated impressive results when trained on synthetic degraded-clear image pairs. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "371", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2024:RDH, author = "Chunqiang Yu and Shichao Cheng and Xianquan Zhang and Xinpeng Zhang and Zhenjun Tang", title = "Reversible Data Hiding in Shared {JPEG} Images", journal = j-TOMM, volume = "20", number = "12", pages = "372:1--372:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3695463", abstract = "Reversible data hiding (RDH) in encrypted images has emerged as an effective technique for securely storing and managing confidential images in the cloud. However, most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "372", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:TDC, author = "Boqian Liu and Haojie Li and Zhihui Wang and Tianfan Xue", title = "Transparent Depth Completion Using Segmentation Features", journal = j-TOMM, volume = "20", number = "12", pages = "373:1--373:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694978", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694978", abstract = "Estimating the depth of transparent objects is one of the well-known challenges of RGB-D cameras due to the reflection and refraction effects. Previously, researchers \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "373", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bao:2024:CLP, author = "Yongtang Bao and Chunjian Su and Yutong Qi and Yanbing Geng and Haojie Li", title = "Category-Level Pose Estimation and Iterative Refinement for Monocular {RGB-D} Image", journal = j-TOMM, volume = "20", number = "12", pages = "374:1--374:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695877", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3695877", abstract = "Category-level pose estimation is proposed to predict the 6D pose of objects under a specific category and has wide applications in fields such as robotics, virtual reality, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "374", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2024:MMD, author = "Kuiyuan Sun and Xiaolong Liu and Xiaolong Li and Yao Zhao and Wei Wang", title = "Multi-Modal Driven Pose-Controllable Talking Head Generation", journal = j-TOMM, volume = "20", number = "12", pages = "375:1--375:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673901", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3673901", abstract = "Talking head, driving a source image to generate a talking video using other modality information, has made great progress in recent years. 
However, there are two main issues: (1) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "375", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:DIC, author = "Bing Liu and Jinfu Lu and Mingming Liu and Hao Liu and Yong Zhou and Dongping Yang", title = "Diverse Image Captioning via Panoptic Segmentation and Sequential Conditional Variational Transformer", journal = j-TOMM, volume = "20", number = "12", pages = "376:1--376:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695878", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3695878", abstract = "Recently, transformer-based image captioning models have achieved significant performance improvement. However, due to the limitations of region visual features and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "376", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Stephanie:2024:WBP, author = "Veronika Stephanie and Ibrahim Khalil and Mohammed Atiquzzaman", title = "Weight-Based Privacy-Preserving Asynchronous {SplitFed} for Multimedia Healthcare Data", journal = j-TOMM, volume = "20", number = "12", pages = "377:1--377:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695876", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3695876", abstract = "Multimedia significantly enhances modern healthcare by facilitating the analysis and sharing of diverse data, including medical images, videos, and sensor data. Integrating AI \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "377", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:ASS, author = "Chuanhao Li and Chenchen Jing and Zhen Li and Yuwei Wu and Yunde Jia", title = "Adversarial Sample Synthesis for Visual Question Answering", journal = j-TOMM, volume = "20", number = "12", pages = "378:1--378:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688848", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3688848", abstract = "Language prior is a major block to improving the generalization of visual question answering (VQA) models. Recent work has revealed that synthesizing extra training samples to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "378", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2024:IST, author = "Shipeng Zhu and Jun Fang and Pengfei Fang and Hui Xue", title = "Improving Scene Text Retrieval via Stylized Middle Modality", journal = j-TOMM, volume = "20", number = "12", pages = "379:1--379:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696209", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3696209", abstract = "Scene text retrieval addresses the challenge of localizing and searching for all text instances within scene images based on a query text. This cross-modal task has significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "379", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liang:2024:CCM, author = "Xiao Liang and Erkun Yang and Cheng Deng and Yanhua Yang", title = "{CrossFormer}: Cross-Modal Representation Learning via Heterogeneous Graph Transformer", journal = j-TOMM, volume = "20", number = "12", pages = "380:1--380:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688801", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3688801", abstract = "Transformers have been recognized as powerful tools for various cross-modal tasks due to their superior ability to perform representation learning through self-attention. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "380", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2024:TTS, author = "Jiayu Lin and Yuan-Gen Wang", title = "{TSFormer}: Tracking Structure Transformer for Image Inpainting", journal = j-TOMM, volume = "20", number = "12", pages = "381:1--381:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696452", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3696452", abstract = "Recent studies have shown that image structure can significantly facilitate image inpainting. However, current approaches mostly explore structure prior without considering its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "381", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:DST, author = "Yixuan Li and Peilin Chen and Hanwei Zhu and Keyan Ding and Leida Li and Shiqi Wang", title = "Deep Shape-Texture Statistics for Completely Blind Image Quality Evaluation", journal = j-TOMM, volume = "20", number = "12", pages = "382:1--382:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694977", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694977", abstract = "Opinion-Unaware Blind Image Quality Assessment (OU-BIQA) models aim to predict image quality without training on reference images and subjective quality scores. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "382", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2024:PEP, author = "Zhenyu Zhou and Qing Liao and Lei Luo and Xinwang Liu and En Zhu", title = "{ProtoRefine}: Enhancing Prototypes with Similar Structure in Few-Shot Learning", journal = j-TOMM, volume = "20", number = "12", pages = "383:1--383:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694686", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694686", abstract = "Few-shot learning presents a substantial challenge in developing robust models due to the inherent scarcity of samples within each category. To overcome this challenge, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "383", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2024:RHD, author = "Mengzhu Yu and Zhenjun Tang and Xiaoping Liang and Xianquan Zhang and Zhixin Li and Xinpeng Zhang", title = "Robust Hashing with Deep Features and {Meixner} Moments for Image Copy Detection", journal = j-TOMM, volume = "20", number = "12", pages = "384:1--384:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696669", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3696669", abstract = "Copy detection is a key task of image copyright protection. Most robust hashing schemes do not make satisfied performance of image copy detection yet. 
To address this, a robust \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "384", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2024:PFM, author = "Jiabei Liu and Weiming Zhuang and Yuanyuan Liu and Yonggang Wen and Jun Huang and Wei Lin", title = "Personalized Federated Mutual Learning for Unsupervised Camera-Aware Person Re-Identification", journal = j-TOMM, volume = "20", number = "12", pages = "385:1--385:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696453", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3696453", abstract = "Person re-identification (ReID) is essential for enhancing security and tracking in multi-camera surveillance systems. To achieve effective ReID performance across diverse datasets, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "385", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2024:PBM, author = "Yiyang Ma and Haowei Kuang and Huan Yang and Jianlong Fu and Jiaying Liu", title = "Prompt-Based Modality Bridging for Unified Text-to-Face Generation and Manipulation", journal = j-TOMM, volume = "20", number = "12", pages = "386:1--386:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694974", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694974", abstract = "Text-driven face image generation and manipulation are significant tasks. However, such tasks are quite challenging due to the gap between text and image modalities. It \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "386", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2024:OMG, author = "Peilin Chen and Shiqi Wang and Zhu Li", title = "Occupancy Map Guided Attributes Artifacts Removal for Video-Based Point Cloud Compression", journal = j-TOMM, volume = "20", number = "12", pages = "387:1--387:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3697351", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3697351", abstract = "Point clouds offer realistic 3D representations of objects and scenes at the expense of large data volumes. To represent such data compactly in real-world applications, Video-Based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "387", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2024:IRL, author = "Yunda Sun and Lin Zhang and Zhong Wang and Yang Chen and Shengjie Zhao and Yicong Zhou", title = "{I2P} Registration by Learning the Underlying Alignment Feature Space from Pixel-to-Point Similarities", journal = j-TOMM, volume = "20", number = "12", pages = "388:1--388:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3697839", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3697839", abstract = "Estimating the relative pose between a camera and a LiDAR holds paramount importance in facilitating complex task execution within multi-agent systems. Nonetheless, current \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "388", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gebre:2024:ETS, author = "Daniel Gebre and Siem Hadish and Aron Sbhatu and Moayad Aloqaily and Mohsen Guizani", title = "Establishing Trust and Security in Decentralized Metaverse: a {Web 3.0} Approach", journal = j-TOMM, volume = "20", number = "12", pages = "389:1--389:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696454", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3696454", abstract = "The integration of blockchain and Web 3.0 technologies offers significant advancements in identity management and trust within decentralized Metaverse environments. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "389", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mao:2024:IRB, author = "Yangjun Mao and Jun Xiao and Dong Zhang and Meng Cao and Jian Shao and Yueting Zhuang and Long Chen", title = "Improving Reference-Based Distinctive Image Captioning with Contrastive Rewards", journal = j-TOMM, volume = "20", number = "12", pages = "390:1--390:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3694683", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694683", abstract = "Distinctive Image Captioning (DIC)-generating distinctive captions that describe the unique details of a target image-has received considerable attention over the last \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "390", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:MAS, author = "Shenglan Li and Rui Yao and Yong Zhou and Hancheng Zhu and Jiaqi Zhao and Zhiwen Shao and Abdulmotaleb {El Saddik}", title = "Motion-Aware Self-Supervised {RGBT} Tracking with Multi-Modality Hierarchical Transformers", journal = j-TOMM, volume = "20", number = "12", pages = "391:1--391:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3698399", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3698399", abstract = "Supervised RGBT (SRGBT) tracking tasks need both expensive and time-consuming annotations. 
Therefore, the implementation of Self-Supervised RGBT (SSRGBT) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "391", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ling:2024:VLD, author = "Jun Ling and Han Xue and Anni Tang and Rong Xie and Li Song", title = "{ViCoFace}: Learning Disentangled Latent Motion Representations for Visual-Consistent Face Reenactment", journal = j-TOMM, volume = "20", number = "12", pages = "392:1--392:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3698769", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3698769", abstract = "Unsupervised face reenactment aims to animate a source image to imitate the motions of a target image while retaining the source portrait's attributes like facial geometry, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "392", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2024:MGH, author = "Jiachen Li and Qing Xie and Xiaojun Chang and Jinyu Xu and Yongjian Liu", title = "Mutually-Guided Hierarchical Multi-Modal Feature Learning for Referring Image Segmentation", journal = j-TOMM, volume = "20", number = "12", pages = "393:1--393:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3698771", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3698771", abstract = "Referring image segmentation aims to locate and segment the target region based on a given textual expression query. The primary challenge is to understand semantics from visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "393", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Alshehri:2024:ISS, author = "Fatima Alshehri and Ghulam Muhammad", title = "Ischemic Stroke Segmentation by Transformer and Convolutional Neural Network Using Few-Shot Learning", journal = j-TOMM, volume = "20", number = "12", pages = "394:1--394:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3699513", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Dec 11 10:08:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3699513", abstract = "Stroke is a major factor in causing disability and fatalities. 
Doctors use computerized tomography (CT) and magnetic resonance imaging (MRI) scans to assess the severity of a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "394", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ionescu:2025:ISI, author = "Bogdan Ionescu and Ioannis Patras and Henning M{\"u}ller and Alberto {Del Bimbo}", title = "Introduction to the Special Issue on Realistic Synthetic Data: Generation, Learning, Evaluation", journal = j-TOMM, volume = "21", number = "1", pages = "1:1--1:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703593", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3703593", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Westerski:2025:SDO, author = "Adam Westerski and Wee Teck Fong", title = "Synthetic Data for Object Detection with Neural Networks: State-of-the-Art Survey of Domain Randomisation Techniques", journal = j-TOMM, volume = "21", number = "1", pages = "2:1--2:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3637064", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3637064", abstract = "Machine learning relies heavily on access to large and well-maintained datasets. 
In this article, we focus on Computer Vision and object detection applications to survey past research on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Vaz:2025:GPS, author = "Bruno Vaz and {\'A}lvaro Figueira", title = "{GANs} in the Panorama of Synthetic Data Generation Methods", journal = j-TOMM, volume = "21", number = "1", pages = "3:1--3:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3657294", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3657294", abstract = "This article focuses on the creation and evaluation of synthetic data to address the challenges of imbalanced datasets in machine learning (ML) applications, using fake \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Idris:2025:SIT, author = "Azeez Idris and Mohammed Khaleel and Wallapak Tavanapong and Piet C. 
{De Groen}", title = "Synthesized Image Training Techniques: On Improving Model Performance Using Confusion", journal = j-TOMM, volume = "21", number = "1", pages = "4:1--4:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3641856", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3641856", abstract = "The performance of supervised deep learning image classifiers has significantly improved with large, labeled datasets and increased computing power. However, obtaining large, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2025:GAR, author = "Wenmiao Hu and Yifang Yin and Ying Kiat Tan and An Tran and Hannes Kruppa and Roger Zimmermann", title = "{GAN}-Assisted Road Segmentation from Satellite Imagery", journal = j-TOMM, volume = "21", number = "1", pages = "5:1--5:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3635153", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635153", abstract = "Geo-information extraction from satellite imagery has become crucial to carry out large-scale ground surveys in a short amount of time. With the increasing number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hellmann:2025:GGB, author = "Fabio Hellmann and Silvan Mertes and Mohamed Benouis and Alexander Hustinx and Tzung-Chien Hsieh and Cristina Conati and Peter Krawitz and Elisabeth Andr{\'e}", title = "{GANonymization}: a {GAN}-Based Face Anonymization Framework for Preserving Emotional Expressions", journal = j-TOMM, volume = "21", number = "1", pages = "6:1--6:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3641107", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3641107", abstract = "In recent years, the increasing availability of personal data has raised concerns regarding privacy and security. One of the critical processes to address these concerns is data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zou:2025:FED, author = "Kaifeng Zou and Sylvain Faisan and Boyang Yu and Sebastien Valette and Hyewon Seo", title = "{$4$D} Facial Expression Diffusion Model", journal = j-TOMM, volume = "21", number = "1", pages = "7:1--7:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3653455", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3653455", abstract = "Facial expression generation is one of the most challenging and long-sought aspects of character animation, with many interesting applications. 
The challenging task, traditionally \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{T:2025:TGS, author = "Anjali T. and {Masilamani V}", title = "Text-Guided Synthesis of Masked Face Images", journal = j-TOMM, volume = "21", number = "1", pages = "8:1--8:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3654667", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3654667", abstract = "The COVID-19 pandemic has made us all understand that wearing a face mask protects us from the spread of respiratory viruses. Face authentication systems, which are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:DRG, author = "Xin Huang and Dong Liang and Hongrui Cai and Yunfeng Bai and Juyong Zhang and Feng Tian and Jinyuan Jia", title = "Double Reference Guided Interactive {$2$D} and {$3$D} Caricature Generation", journal = j-TOMM, volume = "21", number = "1", pages = "9:1--9:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3655624", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3655624", abstract = "In this article, we propose the first geometry and texture (double) referenced interactive two-dimensional (2D) and 3D caricature generating and editing method. 
The main \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Desai:2025:RRS, author = "Chaitra Desai and Sujay Benur and Ujwala Patil and Uma Mudenagudi", title = "{RSUIGM}: Realistic Synthetic Underwater Image Generation with Image Formation Model", journal = j-TOMM, volume = "21", number = "1", pages = "10:1--10:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3656473", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3656473", abstract = "In this article, we propose to synthesize realistic underwater images with a novel image formation model, considering both downwelling depth and line of sight (LOS) distance as cue and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Amoroso:2025:PCD, author = "Roberto Amoroso and Davide Morelli and Marcella Cornia and Lorenzo Baraldi and Alberto {Del Bimbo} and Rita Cucchiara", title = "Parents and Children: Distinguishing Multimodal Deepfakes from Natural Images", journal = j-TOMM, volume = "21", number = "1", pages = "11:1--11:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3665497", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665497", abstract = "Recent advancements in diffusion models have enabled the generation of realistic deepfakes from textual prompts in natural language. While these models have numerous benefits \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Celard:2025:NMD, author = "Pedro Celard and Eva Lorenzo Iglesias and Jose Manuel Sorribes-Fern{\'a}ndez and Lourdes Borrajo and Adri{\'a}n Seara Vieira", title = "New Metrics and Dataset for Biological Development Video Generation", journal = j-TOMM, volume = "21", number = "1", pages = "12:1--12:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3653456", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3653456", abstract = "Image generative models have advanced in many areas to produce synthetic images of high resolution and detail. 
This success has enabled its use in the biomedical field, paving the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gramoli:2025:GED, author = "Lysa Gramoli and Julien Cumin and J{\'e}r{\'e}my Lacoche and Anthony Foulonneau and Bruno Arnaldi and Val{\'e}rie Gouranton", title = "Generating and Evaluating Data of Daily Activities with an Autonomous Agent in a Virtual Smart Home", journal = j-TOMM, volume = "21", number = "1", pages = "13:1--13:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3665331", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3665331", abstract = "Training machine learning models to identify human behavior is a difficult yet essential task to develop autonomous and adaptive systems such as smart homes. These models require \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Airale:2025:AGS, author = "Louis Airale and Xavier Alameda-Pineda and St{\'e}phane Lathuili{\`e}re and Dominique Vaufreydaz", title = "Autoregressive {GAN} for Semantic Unconditional Head Motion Generation", journal = j-TOMM, volume = "21", number = "1", pages = "14:1--14:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3635154", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3635154", abstract = "In this work, we address the task of unconditional head motion generation to animate still human faces in a low-dimensional semantic space from a single reference pose. Different from \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hodzic:2025:DFC, author = "Kerim Hod{\v{z}}i{\'c} and Mirsad Cosovic and Sasa Mrdovic and Jason J. Quinlan and Darijo Raca", title = "{DashReStreamer}: Framework for Creation of Impaired Video Clips under Realistic Network Conditions", journal = j-TOMM, volume = "21", number = "1", pages = "15:1--15:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3640016", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3640016", abstract = "The continuous rise of multimedia entertainment has led to an increased demand for delivering outstanding user experience of multimedia content. 
However, modeling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Constantin:2025:EGA, author = "Mihai Gabriel Constantin and Dan-Cristian Stanciu and Liviu-Daniel {\c{S}}tefan and Mihai Dogariu and Dan Mih{\u{a}}ilescu and George Ciobanu and Matt Bergeron and Winston Liu and Konstantin Belov and Octavian Radu and Bogdan Ionescu", title = "Exploring Generative Adversarial Networks for Augmenting Network Intrusion Detection Tasks", journal = j-TOMM, volume = "21", number = "1", pages = "16:1--16:??", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689636", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Jan 21 06:46:21 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3689636", abstract = "The advent of generative networks and their adoption in numerous domains and communities have led to a wave of innovation and breakthroughs in AI and machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:ISI, author = "Yushu Zhang and William Puech and Anderson Rocha and Rongxing Lu and Stefano Cresci and Roberto Di Pietro", title = "Introduction to the Special Issue on Security and Privacy of Avatar in Metaverse", journal = j-TOMM, volume = "21", number = "2", pages = "41:1--41:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702485", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3702485", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "41", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:SDC, author = "Fan Wang and Zhangjie Fu and Xiang Zhang", title = "A Self-Defense Copyright Protection Scheme for {NFT} Image Art Based on Information Embedding", journal = j-TOMM, volume = "21", number = "2", pages = "42:1--42:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3652519", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3652519", abstract = "Non-convertible tokens (NFTs) have become a fundamental part of the metaverse ecosystem due to its uniqueness and immutability. However, existing copyright protection schemes \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "42", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:IAW, author = "Jinwei Wang and Haihua Wang and Jiawei Zhang and Hao Wu and Xiangyang Luo and Bin Ma", title = "Invisible Adversarial Watermarking: a Novel Security Mechanism for Enhancing Copyright Protection", journal = j-TOMM, volume = "21", number = "2", pages = "43:1--43:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3652608", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3652608", abstract = "Invisible watermarking can be used as an important tool for copyright certification in the Metaverse. However, with the advent of deep learning, Deep Neural Networks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "43", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhai:2025:FCP, author = "Rui Zhai and Rongrong Ni and Yang Yu and Yao Zhao", title = "{FaceDefend}: Copyright Protection to Prevent Face Embezzle", journal = j-TOMM, volume = "21", number = "2", pages = "44:1--44:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3699718", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3699718", abstract = "With the rapid evolution of deep learning and the advent of AI, the metaverse has emerged as a significant technology. Within the metaverse, diverse elements such as rich \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "44", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2025:AVC, author = "Hanqing Zhao and Wenbo Zhou and Dongdong Chen and Weiming Zhang and Ying Guo and Zhen Cheng and Pengfei Yan and Nenghai Yu", title = "Audio-Visual Contrastive Pre-train for Face Forgery Detection", journal = j-TOMM, volume = "21", number = "2", pages = "45:1--45:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3651311", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3651311", abstract = "The highly realistic avatar in the metaverse may lead to deepfakes of facial identity. Malicious users can more easily obtain the three-dimensional structure of faces, thus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "45", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2025:FEM, author = "Long Tang and Dengpan Ye and Zhenhao Lu and Yunming Zhang and Chuanxi Chen", title = "Feature Extraction Matters More: an Effective and Efficient Universal Deepfake Disruptor", journal = j-TOMM, volume = "21", number = "2", pages = "46:1--46:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3653457", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3653457", abstract = "Face manipulation can modify a victim's facial attributes (e.g., age or hair color) in an image, which is an important component of deepfakes. 
Adversarial examples are an emerging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "46", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:DIP, author = "Jian Zhang and Jiangqun Ni and Fan Nie and Jiwu Huang", title = "Domain-invariant and Patch-discriminative Feature Learning for General Deepfake Detection", journal = j-TOMM, volume = "21", number = "2", pages = "47:1--47:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3657297", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3657297", abstract = "Hyper-realistic avatars in the metaverse have already raised security concerns about deepfake techniques; deepfakes involving generated video ``recording'' may be mistaken for a real \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "47", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:SIL, author = "Dengyong Zhang and Wenjie Zhu and Xin Liao and Feifan Qi and Gaobo Yang and Xiangling Ding", title = "Spatiotemporal Inconsistency Learning and Interactive Fusion for Deepfake Video Detection", journal = j-TOMM, volume = "21", number = "2", pages = "48:1--48:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3664654", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3664654", abstract = "With the rise of the metaverse, the rapid advancement of Deepfakes technology has become closely intertwined. Within the metaverse, individuals exist in digital \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "48", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:DVD, author = "Rui Yang and Rushi Lan and Zhenrong Deng and Xiaonan Luo and Xiyan Sun", title = "Deepfake Video Detection Using Facial Feature Points and Ch-Transformer", journal = j-TOMM, volume = "21", number = "2", pages = "49:1--49:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3672566", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3672566", abstract = "With the development of Metaverse technology, the avatar in Metaverse has faced serious security and privacy concerns. 
Analyzing facial features to distinguish between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "49", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2025:QAO, author = "Jianheng Tang and Kejia Fan and Wenjie Yin and Shihao Yang and Yajiang Huang and Anfeng Liu and Naixue Xiong and Mianxiong Dong and Tian Wang and Shaobo Zhang", title = "A Quality-Aware and Obfuscation-Based Data Collection Scheme for Cyber-Physical {Metaverse} Systems", journal = j-TOMM, volume = "21", number = "2", pages = "50:1--50:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3659582", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3659582", abstract = "In pursuit of an immersive virtual experience within the Cyber-Physical Metaverse Systems (CPMS), the construction of Avatars often requires a significant amount of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "50", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Han:2025:EBF, author = "Xiaoxuan Han and Songlin Yang and Wei Wang and Ziwen He and Jing Dong", title = "Exploiting Backdoors of Face Synthesis Detection with Natural Triggers", journal = j-TOMM, volume = "21", number = "2", pages = "51:1--51:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3677380", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3677380", abstract = "Deep neural networks have enhanced face synthesis detection in discriminating Artificial Intelligence Generated Content (AIGC). However, their security is threatened by the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "51", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zeng:2025:NTS, author = "Jiuzhen Zeng and Laurence T. Yang and Chao Wang and Junjie Su and Xianjun Deng", title = "A New Tensor Summary Statistic for Real-Time Detection of Stealthy Anomaly in Avatar Interaction", journal = j-TOMM, volume = "21", number = "2", pages = "52:1--52:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689429", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3689429", abstract = "Avatar is one of the most intuitive central components in Metaverse and faces serious security problems, particularly during the interaction with each other. 
In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "52", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sha:2025:VDB, author = "Letian Sha and Xiao Chen and Fu Xiao and Zhong Wang and Zhangbo Long and Qianyu Fan and Jiankuo Dong", title = "{VRVul}-Discovery: {BiLSTM}-based Vulnerability Discovery for Virtual Reality Devices in Metaverse", journal = j-TOMM, volume = "21", number = "2", pages = "53:1--53:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3677609", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3677609", abstract = "The rapid development of the metaverse has brought about numerous security challenges. Virtual Reality (VR), as one of the core technologies, plays a crucial role in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "53", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiao:2025:PPI, author = "Gui Xiao and Zhen Ling and Qunqun Fan and Xiangyu Xu and Wenjia Wu and Ding Ding and Chen Chen and Xinwen Fu", title = "{Pivot}: Panoramic-Image-Based {VR} User Authentication against Side-Channel Attacks", journal = j-TOMM, volume = "21", number = "2", pages = "54:1--54:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3694975", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3694975", abstract = "With metaverse attracting increasing attention from both academic and industry, the application of virtual reality (VR) has extended beyond 3D immersive viewing/gaming to a broader \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "54", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2025:CAB, author = "Yalin Song and Wenbin Jiang and Xiuli Chai and Zhihua Gan and Mengyuan Zhou and Lei Chen", title = "Cross-Attention Based Two-Branch Networks for Document Image Forgery Localization in the {Metaverse}", journal = j-TOMM, volume = "21", number = "2", pages = "55:1--55:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3686158", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3686158", abstract = "In recent years, the Metaverse has garnered significant attention in social and Metahuman realms, showcasing substantial value and immense developmental potential \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "55", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:CAG, author = "Yuanman Li and Lanhao Ye and Haokun Cao and Wei Wang and Zhongyun Hua", title = "Cascaded Adaptive Graph Representation Learning for Image Copy--Move Forgery Detection", journal = j-TOMM, volume = "21", number = "2", pages = "56:1--56:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3669905", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Feb 13 06:04:53 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3669905", abstract = "In the realm of image security, there has been a burgeoning interest in harnessing deep learning techniques for the detection of digital image copy-move forgeries, resulting in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "56", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2025:DNI, author = "Cong Hu and Xiao-Zhong Wei and Xiao-Jun Wu", title = "{DIRformer}: a Novel Image Restoration Approach Based on {U}-shaped Transformer and Diffusion Models", journal = j-TOMM, volume = "21", number = "2", pages = "57:1--57:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703632", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3703632", abstract = "Image restoration (IR) involves the retrieval of missing or damaged image information and represents a significant challenge in the field of visual reconstruction. 
Currently, U-Net based Diffusion Models (DMs) display favorable results when utilized for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "57", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:RDD, author = "Yuyu Xu and Pingping Zhang and Minghui Chen and Qiudan Zhang and Wenhui Wu and Yun Zhang and Xu Wang", title = "{RGB-D} Data Compression via Bi-Directional Cross-Modal Prior Transfer and Enhanced Entropy Modeling", journal = j-TOMM, volume = "21", number = "2", pages = "58:1--58:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702997", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3702997", abstract = "RGB-D data, being homogeneous cross-modal data, demonstrates significant correlations among data elements. However, current research focuses only on a uni-directional pattern of cross-modal contextual information, neglecting the exploration of bi- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "58", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:APS, author = "Jiayu Yang and Yongqi Zhai and Wei Jiang and Chunhui Yang and Feng Gao and Ronggang Wang", title = "Adaptive Prediction Structure for Learned Video Compression", journal = j-TOMM, volume = "21", number = "2", pages = "59:1--59:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703914", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3703914", abstract = "Learned video compression has developed rapidly and shown competitive rate-distortion performance compared with the latest traditional video coding standard H.266 (VVC). However, existing works were restricted to fixed prediction direction and GoP size. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "59", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:TCT, author = "Yifan Wang and Liang Feng and Fenglin Cai and Lusi Li and Rui Wu and Jie Li", title = "{TEC-CNN}: Toward Efficient Compressing of Convolutional Neural Nets with Low-rank Tensor Decomposition", journal = j-TOMM, volume = "21", number = "2", pages = "60:1--60:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702641", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3702641", abstract = "Most state-of-the-art convolutional neural networks (CNNs) are characterized by excessive parameterization, leading to a high computational burden. Tensor decomposition has emerged as a model reduction technique for compressing deep neural networks. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "60", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2025:PUR, author = "Chong-Yang Xiang and Xiao Wu and Jun-Yan He and Zhaoquan Yuan and Tingquan He", title = "Person in Uniforms Re-Identification", journal = j-TOMM, volume = "21", number = "2", pages = "61:1--61:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703839", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3703839", abstract = "Person in Uniforms Re-identification (PU-ReID) is an emerging computer vision task for various intelligent video surveillance applications. 
PU-ReID is much understudied due to the absence of large-scale annotated datasets, also this task is extremely \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "61", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:ADC, author = "Xiyao Liu and Cundian Yang and Jianbiao He and Hui Fang and Gerald Schaefer and Jian Zhang and Yuesheng Zhu and Shichao Zhang", title = "Attack-Defending Contrastive Learning for Volumetric Medical Image Zero-Watermarking", journal = j-TOMM, volume = "21", number = "2", pages = "62:1--62:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702230", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3702230", abstract = "Zero-watermarking is an emerging distortion-free copyright protection method for volumetric medical images. However, achieving both robustness against various malicious attacks and distinguishability between individual images remains challenging. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "62", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:DRR, author = "Anqi Cao and Zhijing Wan and Xiao Wang and Wei Liu and Wei Wang and Zheng Wang and Xin Xu", title = "Diversity-Representativeness Replay and Knowledge Alignment for Lifelong Vehicle Re-identification", journal = j-TOMM, volume = "21", number = "2", pages = "63:1--63:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702998", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3702998", abstract = "Lifelong Vehicle Re-Identification (LVReID) aims to match a target vehicle across multiple cameras, considering non-stationary and continuous data streams, which fits the needs of the practical application better than traditional vehicle re- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "63", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dongye:2025:DLH, author = "Xiaonuo Dongye and Haiyan Jiang and Dongdong Weng and Zhenliang Zhang", title = "Demonstrative Learning for Human-Agent Knowledge Transfer", journal = j-TOMM, volume = "21", number = "2", pages = "64:1--64:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703838", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3703838", abstract = "Demonstrative learning in virtual reality (VR) is a pivotal learning strategy for knowledge transfer for embodied agents.
While existing studies have extensively explored agents' knowledge transfer through self-demonstrative learning (SDL) or teacher- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "64", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2025:GGS, author = "Chengxin Zhao and Hefei Ling and Jialie Shen and Han Fang and Sijing Xie and Yaokun Fang and Zongyi Li and Ping Li", title = "{GSyncCode}: Geometry Synchronous Hidden Code for One-step Photography Decoding", journal = j-TOMM, volume = "21", number = "2", pages = "65:1--65:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706060", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3706060", abstract = "Invisible hyperlinks and hidden barcodes have recently emerged as a hot topic in offline-to-online messaging, where an invisible message or barcode is embedded in an image and can be decoded via camera shooting. Current schemes involve a two-step decoding \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "65", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:DAM, author = "Xiaolin Chen and Xuemeng Song and Jianhui Zuo and Yinwei Wei and Liqiang Nie and Tat-Seng Chua", title = "Domain-aware Multimodal Dialog Systems with Distribution-based User Characteristic Modeling", journal = j-TOMM, volume = "21", number = "2", pages = "66:1--66:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3704811", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3704811", abstract = "Textual response generation is a pivotal yet challenging task for multimodal task-oriented dialog systems, which targets at generating the appropriate textual response given the multimodal context. Although existing efforts have obtained remarkable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "66", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:SGT, author = "Chenghao Li and Lei Qi and Xin Geng", title = "A {SAM}-guided Two-stream Lightweight Model for Anomaly Detection", journal = j-TOMM, volume = "21", number = "2", pages = "67:1--67:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706574", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3706574", abstract = "In industrial anomaly detection, model efficiency and mobile-friendliness become the primary concerns in real-world applications. 
Simultaneously, the impressive generalization capabilities of Segment Anything (SAM) have garnered broad academic attention, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "67", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:PGE, author = "Ji-Yan Wu and Kasun Gamlath and Archan Misra", title = "{Pr-Ge-Ne}: Efficient Encoding of Pervasive Video Sensing Streams by Pruned Generative Networks", journal = j-TOMM, volume = "21", number = "2", pages = "68:1--68:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706109", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3706109", abstract = "While video sensing, performed by resource-constrained pervasive devices, is a key enabler of many machine intelligence applications, the high energy and bandwidth overheads of streaming video transmission continue to present formidable deployment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "68", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ji:2025:BFM, author = "Wei Ji and Li Li and Zheqi Lv and Wenqiao Zhang and Mengze Li and Zhen Wan and Wenqiang Lei and Roger Zimmermann", title = "Backpropagation-Free Multi-modal On-Device Model Adaptation via Cloud-Device Collaboration", journal = j-TOMM, volume = "21", number = "2", pages = "69:1--69:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706422", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3706422", abstract = "In our increasingly interconnected world, where intelligent devices continually amass copious personalized multi-modal data, a pressing need arises to deliver high-quality, personalized device-aware services. However, this endeavor presents a multifaceted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "69", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Peng:2025:API, author = "Heqi Peng and Yunhong Wang and Ruijie Yang and Beichen Li and Rui Wang and Yuanfang Guo", title = "{AED-PADA}: Improving Generalizability of Adversarial Example Detection via Principal Adversarial Domain Adaptation", journal = j-TOMM, volume = "21", number = "2", pages = "70:1--70:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706061", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3706061", abstract = "Adversarial example detection, which can be conveniently applied in many scenarios, is important in the area of adversarial defense. Unfortunately, existing detection methods suffer from poor generalization performance because their training process \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "70", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:MCS, author = "Ning Xu and Xiaowen Wang and Jing Liu and Lanjun Wang and Xuanya Li and Mengxiao Zhu and Yongdong Zhang and An-An Liu", title = "Model Can Be Subtle: Two Important Mechanisms for Social Media Popularity Prediction", journal = j-TOMM, volume = "21", number = "2", pages = "71:1--71:??", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705319", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:28:52 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3705319", abstract = "Social media popularity prediction is an important channel to explore content sharing and communication on social networks. It aims to capture informative cues by analyzing multi-type data (such as user profile, image, and text) to decide the popularity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "71", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:LLS, author = "Jiapeng Wang and Zening Lin and Dayi Huang and Longfei Xiong and Lianwen Jin", title = "{LiLTv2}: Language-substitutable Layout-image Transformer for Visual Information Extraction", journal = j-TOMM, volume = "21", number = "3", pages = "72:1--72:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708351", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708351", abstract = "Visual Information Extraction (VIE) has experienced substantial growth and heightened interest due to its pivotal role in intelligent document processing. However, most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "72", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jin:2025:NIC, author = "Yili Jin and Jiahao Li and Bin Li and Yan Lu", title = "Neural Image Compression with Regional Decoding", journal = j-TOMM, volume = "21", number = "3", pages = "73:1--73:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708347", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708347", abstract = "As advancements are made in technology such as AR/VR and high-resolution photography, there is a growing need for a function in image compression named regional decoding. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "73", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:EEV, author = "Xiaotian Wu and Xinjie Feng and Bing Chen and Ching-Nung Yang and Qing-Yu Peng and Weiqi Yan", title = "{EVCS-DAS}: Evolving Visual Cryptography Schemes for Dynamic Access Structures", journal = j-TOMM, volume = "21", number = "3", pages = "74:1--74:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708547", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708547", abstract = "A systematic investigation of evolving visual cryptography scheme (EVCS) is carried out in this article. The evolving scheme, denoted as $ (k, \infty) $, differs from the $ (k, n) $ \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "74", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Talhaoui:2025:VDI, author = "Mohamed Zakariya Talhaoui and Zhelong Wang and Mohamed Amine Midoun and Abdelkarim Smaili and Djamel Eddine Mekkaoui and Mourad Lablack and Ke Zhang", title = "Vulnerability Detection and Improvements of an Image Cryptosystem for Real-Time Visual Protection", journal = j-TOMM, volume = "21", number = "3", pages = "75:1--75:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708546", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708546", abstract = "Chaos-based cryptosystems are regarded as highly secure techniques for image encryption. 
However, despite the considerable enhancement of encryption robustness provided by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "75", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:SAC, author = "Kai Xu and Lichun Wang and Shuang Li and Tong Gao and Baocai Yin", title = "Scene Adaptive Context Modeling and Balanced Relation Prediction for Scene Graph Generation", journal = j-TOMM, volume = "21", number = "3", pages = "76:1--76:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708350", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708350", abstract = "Scene graph generation (SGG) aims to perceive objects and their relations in images, which can bridge the gap between upstream detection tasks and downstream high-level visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "76", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Samrouth:2025:SNB, author = "Khouloud Samrouth and Pia {El Housseini} and Olivier Deforges", title = "{Siamese} Network-Based Detection of Deepfake Impersonation Attacks with a Person of Interest Approach", journal = j-TOMM, volume = "21", number = "3", pages = "77:1--77:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708352", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708352", abstract = "Deepfake technology presents critical cybersecurity challenges that have become more popular since easily accessible applications have become more widely available. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "77", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:MHA, author = "Yiping Yang and Baiyun Cui and Yingming Li", title = "A Multimodal Hierarchical Attentional Ordering Network", journal = j-TOMM, volume = "21", number = "3", pages = "78:1--78:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711864", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711864", abstract = "Sequence coherence modeling refers to how to coherently organize a given set of elements within a sequence, which is a fundamental aspect in comprehension, generation, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "78", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ruan:2025:LSA, author = "Haoxian Ruan and Zhihua Xu and Zhijing Yang and Yongyi Lu and Jinghui Qin and Tianshui Chen", title = "Learning Semantic-aware Representation in Visual-Language Models for Multi-label Recognition with Partial Labels", journal = j-TOMM, volume = "21", number = "3", pages = "79:1--79:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708991", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708991", abstract = "Multi-label recognition with partial labels (MLR-PL), in which only some labels are known while others are unknown for each image, is a practical task in computer vision, since \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "79", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2025:MMM, author = "Kun Yan and Zied Bouraoui and Fangyun Wei and Chang Xu and Ping Wang and Shoaib Jameel and Steven Schockaert", title = "Modeling Multi-modal Cross-interaction for Multi-label Few-shot Image Classification Based on Local Feature Selection", journal = j-TOMM, volume = "21", number = "3", pages = "80:1--80:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711867", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711867", abstract = "The aim of multi-label few-shot image classification (ML-FSIC) is to assign semantic labels to images, in settings where only a small number of training examples are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "80", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:MGC, author = "Yajie Liu and Pu Ge and Guodong Wang and Qingjie Liu and Di Huang", title = "Multi-Grained Contrastive Learning for Text-Supervised Open-Vocabulary Semantic Segmentation", journal = j-TOMM, volume = "21", number = "3", pages = "81:1--81:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711868", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711868", abstract = "Learning open-vocabulary semantic segmentation (OVSS) from text supervision has recently received increasing attention for its promising potential in real-world \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "81", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:BSA, author = "Yipei Chen and Hua Yuan and Baojun Ma and Limin Wang and Yu Qian", title = "Beyond Songs: Analyzing User Sentiment through Music Playlists and Multimodal Data", journal = j-TOMM, volume = "21", number = "3", pages = "82:1--82:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708346", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3708346", abstract = "The automatic recognition of user sentiments through their music listening behavior is an important research task in cognitive studies. Whereas prior studies were conducted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "82", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Niu:2025:SBG, author = "Yuzhen Niu and Yeyuan Xu and Yuezhou Li and Jiabang Zhang and Yuzhong Chen", title = "Skeleton-Boundary-Guided Network for Camouflaged Object Detection", journal = j-TOMM, volume = "21", number = "3", pages = "83:1--83:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711869", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711869", abstract = "Camouflaged object detection (COD) aims to resolve the tough issue of accurately segmenting objects hidden in the surroundings. However, the existing methods suffer from two \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "83", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:WDW, author = "Xiaofeng Zhang and Zishan Xu and Hao Tang and Chaochen Gu and Wei Chen and Abdulmotaleb {El Saddik}", title = "Wakeup-Darkness: When Multimodal Meets Unsupervised Low-Light Image Enhancement", journal = j-TOMM, volume = "21", number = "3", pages = "84:1--84:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711929", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711929", abstract = "Low-light image enhancement is a crucial visual task, and many unsupervised methods overlook the degradation of visible information in low-light scenes, adversely affecting the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "84", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tu:2025:DFT, author = "Jiahang Tu and Wei Ji and Hanbin Zhao and Chao Zhang and Roger Zimmermann and Hui Qian", title = "{DriveDiTFit}: Fine-tuning Diffusion Transformers for Autonomous Driving Data Generation", journal = j-TOMM, volume = "21", number = "3", pages = "85:1--85:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712064", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712064", abstract = "In autonomous driving, deep models have shown remarkable performance across various visual perception tasks with the demand of high-quality and huge-diversity training datasets. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "85", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiao:2025:UTI, author = "Yifan Jiao and Chenglong Cai and Bing-Kun Bao", title = "Unified Text-Image Space Alignment with Cross-Modal Prompting in {CLIP} for {UDA}", journal = j-TOMM, volume = "21", number = "3", pages = "86:1--86:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715699", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3715699", abstract = "Unsupervised Domain Adaptation (UDA) aims to transfer models trained on a labeled source domain to an unlabeled target domain. 
Due to the excellent generalization ability of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "86", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kou:2025:PFF, author = "Feifei Kou and Bingwei Wang and Haisheng Li and Chuangying Zhu and Lei Shi and Jiwei Zhang and Limei Qi", title = "Potential Features Fusion Network for Multimodal Fake News Detection", journal = j-TOMM, volume = "21", number = "3", pages = "87:1--87:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711866", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711866", abstract = "With the popularization of social networks, fake news is also widely and rapidly spreading, which poses a great threat to the Internet. Therefore, how to detect fake news automatically \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "87", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zou:2025:GHF, author = "Shihao Zou and Yuanlu Xu and Nikolaos Sarafianos and Federica Bogo and Tony Tung and Weixin Si and Li Cheng", title = "Generating High-Fidelity Clothed Human Dynamics with Temporal Diffusion", journal = j-TOMM, volume = "21", number = "3", pages = "88:1--88:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712011", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712011", abstract = "Clothed human modeling plays a crucial role in multimedia research, with applications spanning virtual reality, gaming, and fashion design. The goal is to learn clothed human \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "88", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:PNM, author = "Jiaxin Chen and Xin Liao and Zhenxing Qian and Zheng Qin", title = "{PRest-Net}: Multi-domain Probability Estimation Network for Robust Image Forgery Detection", journal = j-TOMM, volume = "21", number = "3", pages = "89:1--89:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711930", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711930", abstract = "As an important carrier of information transmission in online social networks (OSNs), the authenticity protection of images is of great significance. 
However, the abuse of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "89", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:MFA, author = "Qiang Li and Di Liu and Guang Zu and Sen Li and Hui Sun and Jianzhong Wang", title = "Multigranularity Feature Aggregation and Cross-level Boundary Modeling for Temporal Action Detection", journal = j-TOMM, volume = "21", number = "3", pages = "90:1--90:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712598", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712598", abstract = "This article presents a Temporal Action Detection (TAD) method with Multigranularity (MG) feature aggregation and Cross-level Boundary Modeling (CBM). Compared with other \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "90", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:NFR, author = "Lin Huang and Chuan Qin and Guorui Feng and Xiangyang Luo and Xinpeng Zhang", title = "New Framework of Robust Image Encryption", journal = j-TOMM, volume = "21", number = "3", pages = "91:1--91:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712601", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712601", abstract = "Designing an end-to-end encryption method for images using the non-linear properties of deep neural networks (DNNs) has gradually attracted the attention of researchers. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "91", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:TSC, author = "Jiayue Chen and Xiaomeng Wang and Tong Xu and Shiwei Wu", title = "Towards Scene-Centric Multi-Level Interest Mining for Video Recommendation", journal = j-TOMM, volume = "21", number = "3", pages = "92:1--92:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712600", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712600", abstract = "Knowledge-aware video recommendation requires the ability of associating external knowledge to capture high-order connectivities between users and videos. One \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "92", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lu:2025:MAC, author = "Xiusheng Lu and Yanbin Hao and Lechao Cheng and Sicheng Zhao and Yutao Liu and Mingli Song", title = "Mixed Attention and Channel Shift Transformer for Efficient Action Recognition", journal = j-TOMM, volume = "21", number = "3", pages = "93:1--93:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712594", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712594", abstract = "The practical use of the Transformer-based methods for processing videos is constrained by the high computing complexity. Although previous approaches adopt the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "93", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2025:DFF, author = "Haifeng Zhao and Chi Zhang and Deyin Liu and Lin Wu", title = "Deformation Field Fusion for Medical Image Registration", journal = j-TOMM, volume = "21", number = "3", pages = "94:1--94:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3707462", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3707462", abstract = "Deformable medical image registration is to find a series of non-linear spatial transformations to align a pair of fixed and moving voxel images. Deep learning based registration \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "94", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ou:2025:MMS, author = "Lisong Ou and Zhixin Li", title = "Multi-modal Sarcasm Detection on Social Media via Multi-Granularity Information Fusion", journal = j-TOMM, volume = "21", number = "3", pages = "95:1--95:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715139", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3715139", abstract = "The rising popularity of diverse social media platforms, commonly utilized by individuals to articulate their emotions in everyday interactions, has spurred a growing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "95", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2025:SRC, author = "Ao Fu and Jiaqi Zhao and Yong Zhou and Wenliang Du and Rui Yao and Abdulmotaleb {El Saddik}", title = "Similarity Regulation and Calibration Alignment for Weakly Supervised Text-Based Person Re-Identification", journal = j-TOMM, volume = "21", number = "3", pages = "96:1--96:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711861", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3711861", abstract = "Traditional text-based person re-identification relies on identity labels. 
However, it is impossible to annotate large datasets, since identity annotation is expensive and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "96", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2025:MLT, author = "Shaojun Zhu and Bincheng Zhu and Kaikai Chi and Jiefan Qiu and Hailong Shi and Xingyu Gao", title = "Maximizing Long-Term Task Completion Ratio of {UAV}-Enabled Wirelessly Powered {MEC} Systems", journal = j-TOMM, volume = "21", number = "3", pages = "97:1--97:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712599", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712599", abstract = "Unmanned Aerial Vehicle (UAV)-enabled wirelessly powered Mobile Edge Computing (MEC) is emerging as a powerful technology for boosting computational capability and energy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "97", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:DDD, author = "Xuanqing Cao and Wengang Zhou and Qi Sun and Weilun Wang and Li Li and Houqiang Li", title = "{DISA}: Disentangled Dual-Branch Framework for Affordance-Aware Human Insertion", journal = j-TOMM, volume = "21", number = "3", pages = "98:1--98:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715140", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3715140", abstract = "Affordance-aware human insertion is a controllable human synthesis task aimed at seamlessly integrating a person into a scene while aligning human pose with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "98", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Mameli:2025:RER, author = "Marco Mameli and Marina Paolanti and Adriano Mancini and Primo Zingaretti and Roberto Pierdicca", title = "{RenderGAN}: Enhancing Real-time Rendering Efficiency with Deep Learning", journal = j-TOMM, volume = "21", number = "3", pages = "99:1--99:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712263", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3712263", abstract = "In the domain of computer graphics, achieving high visual quality in real-time rendering remains a formidable challenge due to the inherent time-quality tradeoff. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "99", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2025:UUD, author = "Lv Tang and Xinfeng Zhang and Li Zhang", title = "{UVC}: a Unified Deep Video Compression Framework", journal = j-TOMM, volume = "21", number = "3", pages = "100:1--100:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715144", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3715144", abstract = "Recently, many works have applied deep learning techniques to video compression tasks, achieving promising results and advancing the field of Deep Learning-Based Video Compression \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "100", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:MSD, author = "Shen Wang and Yu Wang and Renjie Qiao and Kejun Wu and Chia-Wen Lin and Chengtao Cai", title = "Multi-Scale Dynamic Fusion for Visible-Infrared Person Re-Identification", journal = j-TOMM, volume = "21", number = "3", pages = "101:1--101:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715330", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Wed Mar 19 07:23:17 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", URL = "https://dl.acm.org/doi/10.1145/3715330", abstract = "Visible-infrared person re-identification (VI-ReID) aims to match persons across visible and infrared modalities; however, its performance is prone to complex dynamic scenes, such \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "101", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } %%% NB: article number 102 is missing from ACM Web site for v21 @Article{Guo:2025:ISI, author = "Dan Guo and Troy McDaniel and Shuhui Wang and Meng Wang", title = "Introduction to the Special Issue on Deep Learning for Robust Human Body Language Understanding", journal = j-TOMM, volume = "21", number = "4", pages = "103:1--103:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712012", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "103", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:SSR, author = "Jian Zhang and Kaihao He and Ting Yu and Jun Yu and Zhenming Yuan", title = "Semi-Supervised {RGB-D} Hand Gesture Recognition via Mutual Learning of Self-Supervised Models", journal = j-TOMM, volume = "21", number = "4", pages = "104:1--104:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689644", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Human hand gesture recognition is important to human-computer interaction. Gesture recognition based on RGB and Depth (RGB-D) data exploits both RGB and depth images to provide comprehensive results. However, the research under scenario with insufficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "104", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2025:GDC, author = "Shengeng Tang and Feng Xue and Jingjing Wu and Shuo Wang and Richang Hong", title = "Gloss-driven Conditional Diffusion Models for Sign Language Production", journal = j-TOMM, volume = "21", number = "4", pages = "105:1--105:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3663572", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sign Language Production (SLP) aims to convert text or audio sentences into sign language videos corresponding to their semantics, which is challenging due to the diversity and complexity of sign languages, and cross-modal semantic mapping issues. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "105", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:SAG, author = "Kaixin Chen and Lin Zhang and Zhong Wang and Shengjie Zhao and Yicong Zhou", title = "Skeleton-Aware Graph-Based Adversarial Networks for Human Pose Estimation from Sparse {IMUs}", journal = j-TOMM, volume = "21", number = "4", pages = "106:1--106:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3669904", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, sparse-inertial human pose estimation (SI-HPE) with only a few IMUs has shown great potential in various fields. The most advanced work in this area achieved fairish results using only six IMUs. 
However, there are still two major issues that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "106", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tu:2025:LFF, author = "Zhewei Tu and Xiangbo Shu and Peng Huang and Rui Yan and Zhenxing Liu and Jiachao Zhang", title = "Leveraging Frame- and Feature-level Progressive Augmentation for Semi-supervised Action Recognition", journal = j-TOMM, volume = "21", number = "4", pages = "107:1--107:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3655025", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Semi-supervised action recognition is a challenging yet prospective task due to its low reliance on costly labeled videos. One high-profile solution is to explore frame-level weak/strong augmentations for learning abundant representations, inspired by the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "107", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2025:JMD, author = "Linhua Xiang and Zengfu Wang", title = "Joint Mixing Data Augmentation for Skeleton-Based Action Recognition", journal = j-TOMM, volume = "21", number = "4", pages = "108:1--108:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3700878", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Skeleton-based action recognition is beneficial for understanding human behavior in videos, and thus has received much attention in recent years as an important research area in action recognition. Current research focuses on designing more advanced \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "108", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2025:FRB, author = "Zenan Shi and Wenyu Liu and Haipeng Chen", title = "Face Reconstruction-Based Generalized Deepfake Detection Model with Residual Outlook Attention", journal = j-TOMM, volume = "21", number = "4", pages = "109:1--109:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3686162", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the continuous development of deep counterfeiting technology, the information security in our daily life is under serious threat. While existing face forgery detection methods exhibit impressive accuracy when applied to datasets such as FaceForensics++.
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "109", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2025:DSB, author = "Peng He and Jun Yu and Chengjie Ge and Ye Yu and Wei Xu and Lei Wang and Tianyu Liu and Zhen Kan", title = "Domain-Separated Bottleneck Attention Fusion Framework for Multimodal Emotion Recognition", journal = j-TOMM, volume = "21", number = "4", pages = "110:1--110:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711865", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As a focal point of research in various fields, human body language understanding has long been a subject of intense interest. Within this realm, the exploration of emotion recognition through the analysis of facial expressions, voice patterns, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "110", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gan:2025:GAN, author = "Yan Gan and Chenxue Yang and Mao Ye and Renjie Huang and Deqiang Ouyang", title = "Generative Adversarial Networks with Learnable Auxiliary Module for Image Synthesis", journal = j-TOMM, volume = "21", number = "4", pages = "111:1--111:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3653021", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Apr 28 09:11:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Training generative adversarial networks (GANs) for noise-to-image synthesis is a challenging task, primarily due to the instability of GANs' training process. One of the key issues is the generator's sensitivity to input data, which can cause sudden \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "111", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:MMP, author = "Wei Liu and Xin Xu and Hua Chang and Xin Yuan and Zheng Wang", title = "Mix-Modality Person Re-Identification: a New and Practical Paradigm", journal = j-TOMM, volume = "21", number = "4", pages = "112:1--112:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715142", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current visible-infrared cross-modality person re-identification research has only focused on exploring the bi-modality mutual retrieval paradigm, and we propose a new and more practical mix-modality retrieval paradigm.
Existing Visible-Infrared Person \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "112", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:GHH, author = "Nianzi Li and Guijuan Zhang and Ping Du and Dianjie Lu", title = "{GP-HSI}: Human-Scene Interaction with Geometric and Physical Constraints", journal = j-TOMM, volume = "21", number = "4", pages = "113:1--113:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid development of AR/VR technologies, achieving natural and seamless human-scene interactions has emerged as a critical challenge in computer vision. Existing methods suffer from low model placement accuracy and unnatural scene interactions. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "113", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2025:LGB, author = "Enyuan Zhao and Ning Song and Ze Zhang and Jie Nie and Xinyue Liang and Zhiqiang Wei", title = "Language-guided Bias Generation Contrastive Strategy for Visual Question Answering", journal = j-TOMM, volume = "21", number = "4", pages = "114:1--114:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715141", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Visual question answering (VQA) is a challenging task that requires models to understand both visual and linguistic inputs and produce accurate answers. However, VQA models often exploit biases in datasets to make predictions rather than reasoning based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "114", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:DDT, author = "Kun Wang and Jiuxin Cao and Jiawei Ge and Chang Liu and Bo Liu", title = "Dual-Domain Triple Contrast for Cross-Dataset Skeleton-Based Action Recognition", journal = j-TOMM, volume = "21", number = "4", pages = "115:1--115:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715917", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Skeleton-based Action Recognition (SAR) is widely recognized for its robustness and efficiency in human action analysis, but its performance in cross-dataset tasks has been limited due to domain shifts between different datasets. To address this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "115", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:TSG, author = "Runing Li and Jiangyan Dai and Qibing Qin and Chengduan Wang and Huihui Zhang and Yugen Yi", title = "Texture and Structure-Guided Dual-Attention Mechanism for Image Inpainting", journal = j-TOMM, volume = "21", number = "4", pages = "116:1--116:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715962", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep learning exhibits powerful capability in image inpainting task, particularly in generating pixel-level details closely with the human visual perception. 
However, the complex background or larger missing regions make it still encounters the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "116", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:AVS, author = "Nana Zhang and Min Xiong and Dandan Zhu and Kun Zhu and Guangtao Zhai and Xiaokang Yang", title = "Audio-Visual Saliency Prediction Model with Implicit Neural Representation", journal = j-TOMM, volume = "21", number = "4", pages = "117:1--117:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3698881", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the remarkable advancement of deep learning techniques and the wide availability of large-scale datasets, the performance of audio-visual saliency prediction has been drastically improved. Actually, audio-visual saliency prediction is still at an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "117", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:TBA, author = "Zhenqiang Zhang and Kun Li and Shengeng Tang and Yanyan Wei and Fei Wang and Jinxing Zhou and Dan Guo", title = "Temporal Boundary Awareness Network for Repetitive Action Counting", journal = j-TOMM, volume = "21", number = "4", pages = "118:1--118:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712602", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Repetitive Action Counting (RAC) is a critical and challenging task in video analysis, aiming to count the number of repeated actions in videos accurately. Existing methods typically generate a Temporal Self-similarity Matrix (TSM) as an intermediate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "118", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:MPA, author = "Zicheng Zhang and Yingjie Zhou and Chunyi Li and Wei Sun and Xiongkuo Min and Xiaohong Liu and Guangtao Zhai", title = "{MM-PCQA+}: Advancing Multi-Modal Learning for Point Cloud Quality Assessment", journal = j-TOMM, volume = "21", number = "4", pages = "119:1--119:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715134", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The importance of visual quality in point clouds has been significantly underlined due to the rapid rise in 3D vision applications which aim to deliver affordable and superior user experiences. Reviewing the evolution of point cloud quality assessment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "119", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cui:2025:LLE, author = "Xiao Cui and Qi Sun and Min Wang and Li Li and Wengang Zhou and Houqiang Li", title = "{LayoutEnc}: Leveraging Enhanced Layout Representations for Transformer-based Complex Scene Synthesis", journal = j-TOMM, volume = "21", number = "4", pages = "120:1--120:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716389", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In complex scene synthesis, the effective representation of layouts is paramount.
This paper introduces LayoutEnc, an advanced approach specifically designed to enhance layout representation by improving interpretability, robustness, and expressiveness, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "120", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Raju:2025:ABF, author = "Chintha Sri Pothu Raju and Rabul Hussain Laskar and Zulfiqar Ali and Ghulam Muhammad", title = "Attention-based Fusion for Stroke Lesion Segmentation on Computed Tomography Perfusion Data", journal = j-TOMM, volume = "21", number = "4", pages = "121:1--121:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716632", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recent times, stroke has emerged as a significant threat to humans, transforming affected brain tissue into core and penumbra regions. As the penumbra becomes irreversible over time, early core region segmentation is crucial. Automatic segmentation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "121", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:MAO, author = "Qianxing Li and Dehui Kong and Jinghua Li and Dongpan Chen and Baocai Yin", title = "Multi-Anchor Offset Representation Based Coarse-to-Fine Diffusion Model for Human Pose Estimation", journal = j-TOMM, volume = "21", number = "4", pages = "122:1--122:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716387", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "3D human pose estimation (3DHPE) in images aims at estimating 3D joint positions from images. The existing 3DHPE methods usually define the loss function as the error measured by Euclidean distance between the locations of the predicted joints and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "122", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ahmad:2025:CLC, author = "Wasim Ahmad and Yan-Tsung Peng and Yuan-Hao Chang and Gaddisa Olani Ganfure and Sarwar Khan", title = "{CapST}: Leveraging Capsule Networks and Temporal Attention for Accurate Model Attribution in Deep-fake Videos", journal = j-TOMM, volume = "21", number = "4", pages = "123:1--123:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715138", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep-fake videos, generated through AI face-swapping techniques, have garnered considerable attention due to their potential for impactful impersonation attacks. 
While existing research primarily distinguishes real from fake videos, attributing a deep-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "123", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2025:GDG, author = "Zekun Sun and Na Ruan", title = "{GANK}: Dynamic Geometric and Appearance Features for Efficient and Robust Detection of Face Forgery", journal = j-TOMM, volume = "21", number = "4", pages = "124:1--124:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716827", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deepfakes refers to various deep-learning-based techniques that manipulate the face in videos. Maliciously manufactured face forgeries could result in serious problems such as portrait infringement, information confusion, or even public panic. Previous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "124", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2025:ICC, author = "Hancheng Zhu and Li Yan and Yong Zhou and Rui Yao and Zhiwen Shao and Jiaqi Zhao and Leida Li", title = "Image Cropping with Content and Composition Attribute-aware Global Relation Reasoning", journal = j-TOMM, volume = "21", number = "4", pages = "125:1--125:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719012", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image cropping aims to find visually pleasing content in an image, which will enhance its aesthetic quality. Existing image cropping approaches mainly emphasize the geometric properties of images, such as composition and layout, neglecting the rich \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "125", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wen:2025:LGR, author = "Wenying Wen and Yu Ye and Ziye Yuan and Baolin Qiu and Dingli Hua", title = "{LFIZW-GRHFMR}: Robust Zero-Watermarking with {GRHFMR} for Light Field Image", journal = j-TOMM, volume = "21", number = "4", pages = "126:1--126:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3717066", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Light field (LF) image data potentially involves a lot of sensitive information about users. Its transmission channel breaches could compromise user privacy and implicate illegal activities. 
Therefore, the confidentiality and integrity of LF image data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "126", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:CAR, author = "Fan Chen and Lingfeng Qu and Hadi Amirpour and Christian Timmerer and Hongjie He", title = "Counterfeiting Attacks on an {RDH-EI} Scheme Based on Block-Permutation and {Co-XOR}", journal = j-TOMM, volume = "21", number = "4", pages = "127:1--127:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719294", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Reversible data hiding in encrypted images (RDH-EI) has gained widespread attention due to its potential applications in secure cloud storage. However, the security challenges of RDH-EI in cloud storage scenarios remain largely unexplored. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "127", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:FAS, author = "Shangrong Yang and Chunyu Lin and Kang Liao and Yao Zhao", title = "{FishFormer}: Annulus Slicing-based Transformer for Fisheye Rectification", journal = j-TOMM, volume = "21", number = "4", pages = "128:1--128:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719348", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Numerous significant progress on fisheye image rectification has been achieved through CNN. 
Nevertheless, constrained by a fixed receptive field, the global distribution and the local symmetry of the distortion have not been fully exploited. To leverage \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "128", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:TFS, author = "Jiahui Wang and Qin Xu and Bo Jiang and Bin Luo", title = "Transductive Few-shot Learning via Joint Message Passing and Prototype-based Soft-label Propagation", journal = j-TOMM, volume = "21", number = "4", pages = "129:1--129:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719204", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The transductive Few-shot Learning (FSL) mostly employs either prototype learning or label propagation methods to generalize to new classes by using the information of all query samples. However, existing methods have several main limitations. First, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "129", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:PGB, author = "Jie Wang and Tingfa Xu and Liqiang Song and Lihe Ding and Hui Li and Peng Jiang and Yuqi Han and Jianan Li", title = "{PAPooling}: Graph-based Position Adaptive Aggregation of Local Geometry in Point Clouds", journal = j-TOMM, volume = "21", number = "4", pages = "130:1--130:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3718742", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Fine-grained geometry, obtained through the assimilation of localized point features, is crucial in the realms of object recognition and scene comprehension within point cloud contexts. Traditional point cloud backbones predominantly utilize max pooling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "130", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2025:TCI, author = "Tao Song and Kunlin Yang and Fan Meng and Xin Li and Handan Sun and Chenglizhao Chen", title = "Tropical Cyclone Image Super-Resolution via Multimodality Fusion", journal = j-TOMM, volume = "21", number = "4", pages = "131:1--131:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3714471", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The traditional super-resolution dataset construction using artificial down-sampling techniques can result in information loss, insufficient diversity, and non-uniqueness. 
Furthermore, existing methods for image super-resolution are limited to single-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "131", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2025:DPC, author = "Qianjiang Hu and Wei Hu", title = "Dynamic Point Cloud Denoising via Gradient Fields", journal = j-TOMM, volume = "21", number = "4", pages = "132:1--132:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721431", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "3D dynamic point clouds provide a discrete representation of real-world objects or scenes in motion, which have been widely applied in immersive telepresence, autonomous driving, surveillance, and so on. However, point clouds acquired from sensors are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "132", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:AEM, author = "Jiannan Huang and Mengxue Qu and Longfei Li and Yunchao Wei", title = "{AdGPT}: Explore Meaningful Advertising with {ChatGPT}", journal = j-TOMM, volume = "21", number = "4", pages = "133:1--133:??", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3720546", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:05:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Advertising is pervasive in everyday life. 
Some advertisements are not as readily comprehensible, as they convey a deeper message or purpose, which is referred to as ``meaningful advertising.'' These ads often aim to create an emotional connection with the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "133", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wen:2025:PBI, author = "Chao Wen and Chen Wei and Yuhua Qian and Xiaodan Song and Xuemei Xie", title = "Prompt-Based Invertible Mapping Alignment for Unsupervised Domain Adaptation", journal = j-TOMM, volume = "21", number = "5", pages = "134:1--134:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3725735", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Large pre-trained vision-language models (VLMs) like CLIP have shown great potential for solving the unsupervised domain adaptation (UDA) problem. Existing prompt learning for UDA based on the unsupervised-trained VLMs requires distribution alignment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "134", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Deng:2025:ITA, author = "Jiacheng Deng and Dengpan Ye and Jizhi Li and Ziyi Liu and Long Tang and Yunming Zhang", title = "The Interpretable and Transferable Adversarial Attack against Synthetic Speech Detectors", journal = j-TOMM, volume = "21", number = "5", pages = "135:1--135:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727341", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Existing work finds it challenging for adversarial examples to transfer among different synthetic speech detectors because of cross-feature and cross-model. To enhance the transferability of adversarial examples, we propose a spectral saliency analysis \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "135", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ge:2025:BVC, author = "Jiawei Ge and Jiuxin Cao and Xiangmei Chen and Xuelin Zhu and Weijia Liu and Chang Liu and Kun Wang and Bo Liu", title = "Beyond Visual Cues: Synchronously Exploring Target-Centric Semantics for Vision-Language Tracking", journal = j-TOMM, volume = "21", number = "5", pages = "136:1--136:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3726529", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Single object tracking aims to locate one specific target in video sequences, given its initial state. 
Classical trackers rely solely on visual cues, restricting their ability to handle challenges such as appearance variations, ambiguity, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "136", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shi:2025:RNN, author = "Mengyu Shi and Miao Wang and Yujun Zhang", title = "{RePC}: a Novel Neural Video Quality Enhancement System Framework for {ABR} Streaming of {VBR}-encoded Videos", journal = j-TOMM, volume = "21", number = "5", pages = "137:1--137:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727879", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the emergence of next-generation video applications and increasing spatial resolutions, delivering high-quality video is still limited by network bandwidth. Adaptive bitrate (ABR) can select the appropriate bitrate for video streaming based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "137", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Takezoe:2025:CAA, author = "Rinyoichi Takezoe and Hao Chen and Gang Shen and Xuefei Lv and Yaowei Wang and Shiliang Zhang and Xiaoyu Wang", title = "Context-Assisted Active Learning for Weakly Supervised Person Search", journal = j-TOMM, volume = "21", number = "5", pages = "138:1--138:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3714413", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Person search is a challenging task that aims to jointly detect and identify a target person from a large-scale scene image dataset. Fully supervised person search requires both bounding boxes and person identity annotations, making it hard to deploy in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "138", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:MMD, author = "Yang Wang and Yixing Zhang and Xudie Ren and Yuxin Deng", title = "{MoDA}: Mixture of Domain Adapters for Parameter-efficient Generalizable Person Re-identification", journal = j-TOMM, volume = "21", number = "5", pages = "139:1--139:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712595", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The Domain Generalizable Re-identification (DG ReID) task has attracted significant attention in recent years, as a challenging task but closely aligned with practical applications. 
Mixture-of-experts (MoE)-based methods have been studied for DG ReID to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "139", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2025:CAD, author = "Jiebin Yan and Ziwen Tan and Jiale Rao and Lei Wu and Yifan Zuo and Yuming Fang", title = "Computational Analysis of Degradation Modeling in Blind Panoramic Image Quality Assessment", journal = j-TOMM, volume = "21", number = "5", pages = "140:1--140:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3720547", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Blind panoramic image quality assessment (BPIQA) has recently brought a new challenge to the visual quality community, due to the complex interaction between immersive content and human behavior. Although many efforts have been made to advance BPIQA from \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "140", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Feng:2025:ASS, author = "Yuchao Feng and Mengjie Qin and Jiawei Jiang and Jintao Lai and Jianwei Zheng", title = "Axial-shunted Spatial-temporal Conversation for Change Detection", journal = j-TOMM, volume = "21", number = "5", pages = "141:1--141:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721135", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Benefitting from the maturing of intelligence techniques and advanced sensors, recent years have witnessed the full flourishing of change detection (CD) on multi-temporal remote sensing images. However, extraneous interference caused by normal temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "141", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jiang:2025:MLC, author = "Wei Jiang and Jiayu Yang and Yongqi Zhai and Feng Gao and Ronggang Wang", title = "{MLIC++}: Linear Complexity Multi-Reference Entropy Modeling for Learned Image Compression", journal = j-TOMM, volume = "21", number = "5", pages = "142:1--142:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719011", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The latent representation in learned image compression encompasses channel-wise, local spatial, and global spatial correlations, which are essential for the entropy model to capture for conditional entropy minimization. 
Efficiently capturing these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "142", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhuang:2025:MMS, author = "Xingjie Zhuang and Fengling Zhou and Zhixin Li", title = "Multi-Modal Sarcasm Detection via Knowledge-Aware Focused Graph Convolutional Networks", journal = j-TOMM, volume = "21", number = "5", pages = "143:1--143:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722115", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multi-Modal Sarcasm Detection (MSD) aims to combine multiple modal information to identify implicit sarcastic sentiment. However, the significance of commonsense knowledge in implicit emotion recognition has been frequently overlooked. Additionally, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "143", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:TEE, author = "Xu Liu and Na Xia and Jinxing Zhou and Zhangbin Li and Dan Guo", title = "Towards Energy-efficient Audio-visual Classification via Multimodal Interactive Spiking Neural Network", journal = j-TOMM, volume = "21", number = "5", pages = "144:1--144:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721981", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The Audio-visual Classification (AVC) task aims to determine video categories by integrating audio and visual signals. 
Traditional methods for AVC leverage Artificial Neural Networks (ANNs) that operate on floating-point features, affording large \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "144", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yan:2025:VUB, author = "Jiebin Yan and Kangcheng Wu and Junjie Chen and Ziwen Tan and Yuming Fang and Weide Liu", title = "Viewport-Unaware Blind Omnidirectional Image Quality Assessment: a Flexible and Effective Paradigm", journal = j-TOMM, volume = "21", number = "5", pages = "145:1--145:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723165", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Most of the existing blind omnidirectional image quality assessment (BOIQA) models rely on viewport generation by modeling user viewing behavior or transforming omnidirectional images (OIs) into varying formats; however, these methods are either \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "145", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hua:2025:LAR, author = "Xuecheng Hua and Ke Cheng and Gege Zhu and Hu Lu and Yuanquan Wang and Shitong Wang", title = "Local-Aware Residual Attention Vision Transformer for Visible-Infrared Person Re-Identification", journal = j-TOMM, volume = "21", number = "5", pages = "146:1--146:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723358", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Visible-infrared person re-identification (VI-ReID) task is to retrieve the same pedestrian across the visible and infrared modalities. The existing transformer-based works are constrained by the inherent structure of the ViT that feature collapse in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "146", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jing:2025:INT, author = "Taotao Jing and Haifeng Xia and Hongfu Liu and Zhengming Ding", title = "Interpretable Novel Target Discovery through Open-Set Domain Adaptation", journal = j-TOMM, volume = "21", number = "5", pages = "147:1--147:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722557", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Open-set domain adaptation (OSDA) considers a special domain adaptation problem in which the target domain contains novel categories that never appear in the well-labeled source domain. 
Unfortunately, prior efforts on OSDA simply detect and recognize all \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "147", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:VFI, author = "Dengyong Zhang and Runqi Lou and Jiaxin Chen and Xiangling Ding and Xin Liao and Gaobo Yang", title = "Video Frame Interpolation via Fast Bidirectional {$3$D} Correlation Volume", journal = j-TOMM, volume = "21", number = "5", pages = "148:1--148:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3724123", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, there has been a growing demand for flow-based video frame interpolation methods, which introduce correlation volumes to supervise the correlation of bidirectional optical flows. However, they often overlook the symmetry of the bidirectional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "148", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:CDS, author = "Yan Wang and Hong Xie and Jinyang He and Xiaoyu Shi and Mingsheng Shang", title = "Cross-Domain Semantic Transfer for Domain Generalization", journal = j-TOMM, volume = "21", number = "5", pages = "149:1--149:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3724398", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Data augmentation is a kind of mainstream domain generalization method aimed at enhancing the model's ability to learn from out-of-distribution data. Most existing data augmentation methods fail to simultaneously preserve the semantic consistency and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "149", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2025:TSC, author = "Kang Lin and Wei Zhou and Zhijie Zheng and Dihu Chen and Tao Su", title = "Temporal and Semantic Correlation Network for Weakly-Supervised Temporal Action Localization", journal = j-TOMM, volume = "21", number = "5", pages = "150:1--150:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Weakly-Supervised Temporal Action Localization (WTAL) aims to identify the temporal boundaries and classify actions in untrimmed videos using only video-level labels during training. 
Despite recent progress, many existing approaches primarily follow a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "150", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ye:2025:RRA, author = "Zhaoda Ye and Xiangteng He and Yuxin Peng", title = "{RaT2IGen}: Relation-aware Text-to-image Generation via Learnable Prompt", journal = j-TOMM, volume = "21", number = "5", pages = "151:1--151:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3726527", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Text-to-image generation is to generate photo-realistic images according to the given text descriptions by users. Current methods have achieved promising performance. However, these methods still fail to generate the correct relation of the objects in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "151", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:SPE, author = "Mohan Zhou and Yalong Bai and Qing Yang and Tiejun Zhao", title = "{StyleInject}: Parameter Efficient Tuning of Text-to-Image Diffusion Models", journal = j-TOMM, volume = "21", number = "5", pages = "152:1--152:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730403", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The ability to fine-tune generative models for text-to-image generation tasks is crucial, particularly when facing the complexity involved in accurately interpreting and visualizing textual inputs. While LoRA is efficient for language model adaptation, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "152", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2025:DHQ, author = "Dongjian Yu and Weiqing Min and Xin Jin and Qian Jiang and Ying Jin and Shuqiang Jiang", title = "Diverse and High-Quality Food Image Generation from Only Food Names", journal = j-TOMM, volume = "21", number = "5", pages = "153:1--153:??", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730588", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jun 14 17:01:59 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Food image generation holds promising application prospects in food design, advertising, and food education. 
However, the existing methods rely on information such as recipes, ingredients, or food names, which leads to generated food images with less \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "153", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hsu:2025:MAF, author = "Wei-Yen Hsu and Yu-Chieh Chen", title = "Multi-Attribute Feature-Aware Network for Facial Expression Recognition", journal = j-TOMM, volume = "21", number = "6", pages = "154:1--154:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735559", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Facial expression recognition (FER) has gained popularity as a research topic due to its broad applicability. However, real-world environments present significant challenges to FER, including occlusion, illumination variation, and angle. To address these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "154", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fan:2025:SRR, author = "Linlin Fan and Mingliang Zhou and Xuekai Wei and Yong Feng and Tao Xiang and Bin Fang and Zhaowei Shang and Fan Jia and Xu Zhuang and Huayan Pu and Jun Luo", title = "Sparse Reduced-Rank Fully Connected Layers with Its Applications in Detection and Classification", journal = j-TOMM, volume = "21", number = "6", pages = "155:1--155:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727982", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Fully connected (FC) layers play a significant role in deep neural networks (DNNs) models. Owing to the complexity of its parameters, an FC layer has sufficient capacity to manage high-dimensional tasks, so a large amount of memory and powerful computing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "155", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fani:2025:TSV, author = "Davoud Fani and Aliasghar Beheshti-Shirazi and Mohammad Ghanbari and Esmatollah Rezaei", title = "On Temporal Smoothness of Video Reconstruction Quality in the {DCVS} via Non-Uniform Sampling", journal = j-TOMM, volume = "21", number = "6", pages = "156:1--156:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729270", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "To the distributed video coding approach, which focuses on fully or partially shifting the computational complexity from video encoder to the decoder, the simplicity and highly compact sampling in emerging compressive sensing appear to be very efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "156", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:CEC, author = "I-Chun Huang and Yuang Shi and Yuan-Chun Sun and Wei Tsang Ooi and Chun-Ying Huang and Cheng-Hsin Hsu", title = "Composing Error Concealment Pipelines for Dynamic {3D} Point Cloud Streaming", journal = j-TOMM, volume = "21", number = "6", pages = "157:1--157:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3731561", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Dynamic 3D point clouds enable the immersive user experience and thus have become increasingly more popular in volumetric video streaming applications. 
When being streamed over best-effort networks, point cloud frames may suffer from lost or late packets, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "157", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:VLT, author = "Jie Li and Zhixia Zhao and Qiyue Li and Zhixin Li and Pengyuan Zhou and Zhi Liu and Hao Zhou and Zhu Li", title = "{VPFormer}: Leveraging Transformer with Voxel Integration for Viewport Prediction in Volumetric Video", journal = j-TOMM, volume = "21", number = "6", pages = "158:1--158:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730402", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the continuous advancement of computer vision, image processing technologies, volumetric video, represented by point cloud videos, holds the potential for extensive applications in areas such as Virtual Reality (VR) and Augmented Reality (AR). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "158", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Willis:2025:EHC, author = "Nina Willis and Abraham Bernstein and Luca Rossetto", title = "Effects of Human Cognition-Inspired Task Presentation on Interactive Video Retrieval", journal = j-TOMM, volume = "21", number = "6", pages = "159:1--159:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727983", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Interactive video retrieval is a cooperative process between humans and retrieval systems. Large-scale evaluation campaigns, however, often overlook human factors, such as the effects of perception, attention, and memory, when assessing media retrieval \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "159", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:DEH, author = "Donglin Zhang and Chang-Xing Li and Mengke Li and Zhikai Hu", title = "Discrete Elective Hashing with Incomplete Labels for Efficient Cross-Modal Retrieval", journal = j-TOMM, volume = "21", number = "6", pages = "160:1--160:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736414", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, supervised cross-modal hashing methods have gained considerable attention due to their ability to mine credible semantic relationships between multi-modal data. 
These methods typically rely on labels to explore semantic relationships provided \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "160", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2025:DCM, author = "Bowen Sun and Guo Lu and Shibao Zheng", title = "{DiFace}: Cross-Modal Face Recognition through Controlled Diffusion", journal = j-TOMM, volume = "21", number = "6", pages = "161:1--161:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3732288", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Diffusion Probabilistic Models (DPMs) have exhibited exceptional proficiency in generating visual media of outstanding quality and realism. Nonetheless, their potential in non-generative domains, such as Face Recognition (FR), has yet to be thoroughly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "161", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2025:FGS, author = "Jiajia Tang and Binbin Ni and Feiwei Zhou and Dongjun Liu and Yu Ding and Yong Peng and Andrzej Cichocki and Qibin Zhao and Wanzeng Kong", title = "Fine-grained Semantic Disentanglement Network for Multimodal Sarcasm Analysis", journal = j-TOMM, volume = "21", number = "6", pages = "162:1--162:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722558", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multimodal sarcasm analysis is one of the most challenging research branch of the sentiment analysis area, due to the presence of cross-modality incongruity. However, existing works mainly attend to the coarse-grained incongruity analysis, and totally \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "162", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ren:2025:SDC, author = "Peng Ren and Yunfeng Bai and Xiaoheng Li and Jinyuan Jia", title = "Semantic-driven Cross-space Graph Interaction Network for Fine-grained {3D} Point Cloud Understanding", journal = j-TOMM, volume = "21", number = "6", pages = "163:1--163:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735560", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Since irregular 3D point clouds inherently lack connected relations, most approaches focus less on low-level spherical geometric features and high-level distant semantic feature dependencies and interactions, leading to inadequate feature representations \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "163", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rota:2025:SRL, author = "Claudio Rota and Marco Buzzelli and Simone Bianco and Raimondo Schettini", title = "Scalable Residual {Laplacian} Network for {HEVC-compressed} Video Restoration", journal = j-TOMM, volume = "21", number = "6", pages = "164:1--164:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727147", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "We present a novel Convolutional Neural Network that exploits the Laplacian decomposition technique, which is typically used in traditional image processing, to restore videos compressed with the High-Efficiency Video Coding (HEVC) algorithm. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "164", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:IFG, author = "Shuo Wang and Jinda Lu and Huixia Ben and Yanbin Hao and Xingyu Gao and Meng Wang", title = "Interventional Feature Generation for Few-shot Learning", journal = j-TOMM, volume = "21", number = "6", pages = "165:1--165:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729171", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Few-shot learning (FSL) aims to classify a novel object into a specific category under limited training samples. 
This is a challenging task since (1) the features expressed by pre-trained knowledge introduce perceived bias and then constrain the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "165", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2025:MMA, author = "Lisi Wei and Libo Zhao and Xiaoli Zhang", title = "{MAINet}: Modality-Aware Interaction Network for Medical Image Fusion", journal = j-TOMM, volume = "21", number = "6", pages = "166:1--166:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3731247", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Due to the limitations of imaging sensors, obtaining a medical image that simultaneously captures both functional metabolic data and structural tissue details remains a significant challenge in clinical diagnosis. To address this, Multimodal Medical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "166", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:SMS, author = "Yuxuan Zhou and Mingyang Li and Jingze Tong and Linlin Li and Zhiwei Yang", title = "{SD-Meta}: The Software-Defined Network of Human-Centric Metaverse for Multi-Lead or Multi-Media Data in Spread Spectrum Communications", journal = j-TOMM, volume = "21", number = "6", pages = "167:1--167:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703913", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study proposes the concept of an adjacent two-end or multi-end link of a software-defined network to support the transmission of data from electroencephalogram as well as audio and video streaming through spread spectrum communications. Instead of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "167", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ansar:2025:TFT, author = "Wazib Ansar and Saptarsi Goswami and Amlan Chakrabarti and Basabi Chakraborty", title = "{TexIm FAST}: Text-to-Image Encoding for Semantic Similarity Evaluation of Disproportionate Sequences", journal = j-TOMM, volume = "21", number = "6", pages = "168:1--168:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735974", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "One of the principal objectives of Natural Language Processing (NLP) is to generate meaningful representations from text. 
Improving the informativeness of the representations has led to a tremendous rise in the dimensionality and the memory footprint. It \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "168", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Du:2025:EEI, author = "Qianqian Du and Hui Yin and Lang Nie and Yanting Liu and Jin Wan", title = "{EnIter}: Enhancing Iterative Multi-View Depth Estimation with Universal Contextual Hints", journal = j-TOMM, volume = "21", number = "6", pages = "169:1--169:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3731760", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Iterative inference approaches have shown promising success in the task of multi-view depth estimation. However, these methods put excessive emphasis on the universal inter-view correspondences while neglecting the correspondence ambiguity in regions of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "169", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:RRE, author = "Tong Wu and Jinhua Zhu and Wengang Zhou and Houqiang Li", title = "{RESIST}: Rationale-Enhanced and Reward Model-Based End-to-End Social Influence Dialogue System", journal = j-TOMM, volume = "21", number = "6", pages = "170:1--170:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736580", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Developing proactive social influence dialogue systems presents a significant challenge, particularly in non-cooperative scenarios where the system's goals may conflict with those of the user. Traditional methods often focus on training models to plan \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "170", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:DAS, author = "Yongxin Wang and Feng Dong and Zhen-Duo Chen and Xin Luo and Xin-Shun Xu", title = "Domain-Aware Semantic Alignment Hashing for Large-Scale Zero-Shot Image Retrieval", journal = j-TOMM, volume = "21", number = "6", pages = "171:1--171:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734871", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hashing has been proven to be effective in the field of large-scale image retrieval. However, traditional hashing is stuck in performance dilemmas under zero-shot scenarios due to the concept shift problem. 
Although some zero-shot hashing methods exploit \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "171", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cui:2025:CTR, author = "Jia Cui and Jinchen Shen and Jialin Wei and Shiyu Liu and Zhaojia Ye and Shijian Luo and Zhen Qin", title = "Community Transferrable Representation Learning for Image Style Classification", journal = j-TOMM, volume = "21", number = "6", pages = "172:1--172:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Style is prone to experience but hard to formulate, even for design professionals. The current style classification (SC) methods can be categorized into mono-tasking (style) and multi-tasking (style, artists, genre, and other additional clues). However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "172", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yin:2025:JST, author = "Qian Yin and Xinfeng Zhang and Ruoke Yan and Yuhuai Zhang and Shanshe Wang and Siwei Ma", title = "Joint Structure-Texture Scan-Order for Point Cloud Attribute Compression Using Affine Transformation", journal = j-TOMM, volume = "21", number = "6", pages = "173:1--173:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729232", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Existing geometry-based Point Cloud Compression (PCC) frameworks are typically designed to code the geometric coordinates first, followed by compressing the attributes (e.g., colors, reflectances) according to the order derived from geometric structures, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "173", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:DTD, author = "Yuanzhou Huang and Songwei Pei and Rui Zeng", title = "{DQFormer}: Transformer with Decoupled Query Augmentations for End-to-End Multi-Object Tracking", journal = j-TOMM, volume = "21", number = "6", pages = "174:1--174:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735510", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent online Transformer-based multi-object tracking methods achieve end-to-end optimization by jointly performing detection and association. 
However, these trackers apply query augmentations uniformly to detect queries and track queries during training, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "174", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lyu:2025:TTP, author = "Jiahao Lyu and Jin Wei and Gangyan Zeng and Zeng Li and Enze Xie and Wei Wang and Can Ma and Yu Zhou", title = "{TextBlockV2}: Towards Precise-Detection-Free Scene Text Spotting with Pre-trained Language Model", journal = j-TOMM, volume = "21", number = "6", pages = "175:1--175:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734872", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Existing scene text spotters are designed to locate and transcribe texts from images. However, it is challenging for a spotter to achieve precise detection and recognition of scene texts simultaneously. Inspired by the glimpse-focus spotting pipeline of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "175", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:AAL, author = "Yang-Hao Zhou and Heyan Huang and Cunhan Guo and Rong-Cheng Tu and Zeyu Xiao and Bo Wang and Xian-Ling Mao", title = "{ALOHA}: Adapting Local Spatio-Temporal Context to Enhance the Audio-Visual Semantic Segmentation", journal = j-TOMM, volume = "21", number = "6", pages = "176:1--176:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735975", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Audio-Visual Semantic Segmentation (AVSS) plays a crucial role in pixel-level multi-modal perception for real-world applications such as robotic navigation and autonomous driving. Existing methods typically rely on global spatio-temporal modules to fuse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "176", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:HFI, author = "Bing Yang and Xueqin Xiang and Wanzeng Kong and Jianhai Zhang and Jinliang Yao", title = "Hybrid Feature Integrated Transformer for {3D} Hand Reconstruction from a Single {RGB} Image", journal = j-TOMM, volume = "21", number = "6", pages = "177:1--177:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734873", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Reconstructing a 3D hand from a single RGB image is a very challenging task. 
Most of the existing Transformer-based 3D hand reconstructing methods do not fully consider the local spatial information from low-level image features, which would be crucial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "177", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xian:2025:DDT, author = "Weizhi Xian and Junyi Wang and Xuekai Wei and Jielu Yan and Yueting Huang and Kunyin Guo and Weijia Jia and Mingliang Zhou", title = "{DTSD}: a Dual Teacher-Student-Based Discrimination Model for Anomaly Detection", journal = j-TOMM, volume = "21", number = "6", pages = "178:1--178:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736725", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The rapid development of computer vision technology for detecting anomalies in industrial products has received unprecedented attention. In this article, we propose a dual teacher-student-based discrimination (DTSD) model for anomaly detection, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "178", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:ACG, author = "Jili Chen and Qionghao Huang and Changqin Huang and Xiaodi Huang", title = "Actual Cause-Guided Adaptive Gradient Scaling for Balanced Multimodal Sentiment Analysis", journal = j-TOMM, volume = "21", number = "6", pages = "179:1--179:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736415", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multimodal sentiment analysis leverages information from multiple sensors to achieve a comprehensive interpretation of emotions. However, different modalities do not always boost each other as expected. They compete with each other, leading to some \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "179", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fan:2025:GHQ, author = "Bing Fan and Feng Ding and Guopu Zhu and Jiwu Huang and Sam Kwong and Pradeep K. Atrey and Siwei Lyu", title = "Generating Higher-Quality Anti-Forensics {DeepFakes} with Adversarial Sharpening Mask", journal = j-TOMM, volume = "21", number = "6", pages = "180:1--180:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729233", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "DeepFake, an AI technology that can automatically synthesize facial forgeries, has recently attracted worldwide attention. 
While DeepFakes can be entertaining, they can also be used to spread falsified information or be weaponized as cognition warfare. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "180", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:MPP, author = "Zhiyuan Liu and Qi Zou and Xixia Xu and Yanting Pei", title = "Multi-Person Pose Estimation with Feature Enhancement and Decoupling Based on Contrastive Learning", journal = j-TOMM, volume = "21", number = "6", pages = "181:1--181:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727984", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Most methods of multi-person pose estimation (MPPE) treat the human detection and keypoint localization separately. They need additional supervision like instance bounding boxes, or complex hand-crafted processes like RoI cropping or grouping. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "181", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:BMC, author = "Dongjun Liu and Weichen Dai and Honggang Liu and Hangjie Yi and Wanzeng Kong", title = "Brain-Machine Cross-Modal Alignment via Sample Relational Learning for Visual Classification", journal = j-TOMM, volume = "21", number = "6", pages = "182:1--182:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736416", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent works on visual classification tasks have leveraged EEG signals to provide additional supervisory information, further improving the performance of the models on natural images. However, previous methods often force machine models to directly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "182", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lee:2025:FSS, author = "Seongmin Lee and Jiwoo Kang and Sanghoon Lee", title = "{3D} Facial Shape Similarity with Deep Perceptual Representations", journal = j-TOMM, volume = "21", number = "6", pages = "183:1--183:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734874", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Jul 26 09:23:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Comparing different 3D shapes is challenging due to their irregularities. 
Motivated by the human visual system mechanism, where the entire 3D geometry is clearly perceived as a series of multiple projections, we propose a novel facial shape similarity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "183", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kansal:2025:IPR, author = "Kajal Kansal and Yongkang Wong and Mohan Kankanhalli", title = "Implications of Privacy Regulations on Video Surveillance Systems", journal = j-TOMM, volume = "21", number = "7", pages = "184:1--184:27", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706108", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Advanced video surveillance systems (VSS), which collect information of every individual who passes through a surveilled area, have become ubiquitous due to its utility for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "184", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:AVA, author = "Yu-Ao Wang and James She and Troy TianYu Lin and Kang Zhang", title = "{AI} Visual Art History: an Art Movement with Expanded Artistic Horizon", journal = j-TOMM, volume = "21", number = "7", pages = "185:1--185:16", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3726868", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The progression of AI technology has spurred a growing number of artists to engage in AI Art production. This trend has sparked a multifaceted societal discourse. Given the tight \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "185", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{ElSaddik:2025:UCM, author = "Abdulmotaleb {El Saddik} and Jamil Ahmad and Mustaqeem Khan and Saad Abouzahir and Wail Gueaieb", title = "Unleashing Creativity in the Metaverse: Generative {AI} and Multimodal Content", journal = j-TOMM, volume = "21", number = "7", pages = "186:1--186:43", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3713075", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The metaverse presents an emerging creative expression and collaboration frontier where generative artificial intelligence (GenAI) can play a pivotal role with its ability to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "186", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bentaleb:2025:SCO, author = "Abdelhak Bentaleb and May Lim and Sarra Hammoudi and Saad Harous and Roger Zimmermann", title = "Solutions, Challenges, and Opportunities in Volumetric Video Streaming: an Architectural Perspective", journal = j-TOMM, volume = "21", number = "7", pages = "187:1--187:35", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705321", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Volumetric video streaming technologies are the future of immersive media services such as virtual, augmented, and mixed-reality experiences. The challenges \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "187", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:CAL, author = "Miaohui Wang and Runnan Huang and Wuyuan Xie and Zhan Ma and Siwei Ma", title = "Compression Approaches for {LiDAR} Point Clouds and Beyond: a Survey", journal = j-TOMM, volume = "21", number = "7", pages = "188:1--188:31", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715916", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the widespread use of LiDAR sensors in autonomous driving, LiDAR point cloud compression (LPCC) plays an important role in effectively managing the storage, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "188", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:QAE, author = "Zicheng Zhang and Yingjie Zhou and Chunyi Li and Baixuan Zhao and Xiaohong Liu and Guangtao Zhai", title = "Quality Assessment in the Era of Large Models: a Survey", journal = j-TOMM, volume = "21", number = "7", pages = "189:1--189:31", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722559", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Quality assessment, which evaluates the visual quality level of multimedia experiences, has garnered significant attention from researchers and has evolved substantially through \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "189", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:IMC, author = "Haopeng Wang and Haiwei Dong and Abdulmotaleb {El Saddik}", title = "Immersive Multimedia Communication: State-of-the-Art on Extended Reality Streaming", journal = j-TOMM, volume = "21", number = "7", pages = "190:1--190:33", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721292", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Extended reality (XR) is rapidly advancing and poised to revolutionize content creation and consumption. In XR, users integrate various sensory inputs to form a cohesive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "190", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:WMA, author = "Hao Wu and Maha Abdallah and Yuanfang Chi and Lehao Lin and Wei Cai", title = "Web3 Multimedia Applications: Under the Impact of Decentralization", journal = j-TOMM, volume = "21", number = "7", pages = "191:1--191:38", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3725851", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In the Web3 ecosystem, multimedia applications exhibit significant potential by leveraging decentralization, regarded as the core spirit of Web3. This survey aims to provide a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "191", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rashed:2025:RPE, author = "Ammar Rashed and Shervin Shirmohammadi and Ihab Amer and Mohamed Hefeeda", title = "A Review of Player Engagement Estimation in Video Games: Challenges and Opportunities", journal = j-TOMM, volume = "21", number = "7", pages = "192:1--192:33", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722116", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This article presents a review on the process of estimating player engagement in video gaming. To stay ahead of their competitors in entertainment, game developers need \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "192", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:SFD, author = "Xin Wang and Ting Yu Tsai and Li Lin and Hui Guo and Shu Hu and Ming-Ching Chang and Pradeep K. Atrey and Siwei Lyu", title = "Spotting the Fakes: a Deep Dive into {GAN}-Generated Face Detection", journal = j-TOMM, volume = "21", number = "7", pages = "193:1--193:24", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3742786", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Generative Adversarial Networks (GANs) have enabled the creation of highly authentic facial images, which are increasingly used in deceptive social media profiles and other forms of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "193", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:EID, author = "Xinjie Zhang and Tenggan Zhang and Lei Sun and Jinming Zhao and Qin Jin", title = "Exploring Interpretability in Deep Learning for Affective Computing: a Comprehensive Review", journal = j-TOMM, volume = "21", number = "7", pages = "194:1--194:28", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723005", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep learning has shown impressive performance in affective computing, but its black-box characteristic limits the model's interpretability, posing a challenge to further \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "194", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:SPH, author = "Yuanding Zhou and Xinran Li and Cheng Xiong and Heng Yao and Chuan Qin", title = "A Survey of Perceptual Hashing for Multimedia", journal = j-TOMM, volume = "21", number = "7", pages = "195:1--195:28", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727880", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Perceptual hashing is a cutting-edge technique in the field of digital multimedia security, which maps the perceptual content of multimedia information to a fixed-length hash \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "195", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Min:2025:MFL, author = "Weiqing Min and Xingjian Hong and Yuxin Liu and Mingyu Huang and Ying Jin and Pengfei Zhou and Leyi Xu and Yilin Wang and Shuqiang Jiang and Yong Rui", title = "Multimodal Food Learning", journal = j-TOMM, volume = "21", number = "7", pages = "196:1--196:28", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715143", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Food-centered study has received more attention in the multimedia community for its profound impact on our survival, nutrition and health, pleasure, and enjoyment. Our \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "196", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2025:MIM, author = "Lei Gao and Kai Liu and Zheng Guo and Ling Guan", title = "Mathematics-Inspired Models: a Green and Interpretable Learning Paradigm for Multimedia Computing", journal = j-TOMM, volume = "21", number = "7", pages = "197:1--197:22", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The advances of machine learning (ML), and AI in general, have attracted unprecedented attention in intelligent multimedia computing and many other fields. However, due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "197", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Timmerer:2025:HAS, author = "Christian Timmerer and Hadi Amirpour and Farzad Tashtarian and Samira Afzal and Amr Rizk and Michael Zink and Hermann Hellwagner", title = "{HTTP} Adaptive Streaming: a Review on Current Advances and Future Challenges", journal = j-TOMM, volume = "21", number = "7", pages = "198:1--198:27", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736306", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Video streaming has evolved from push-based, broad-/multicasting approaches with dedicated hard-/software infrastructures to pull-based unicast schemes utilizing existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "198", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Uddin:2025:ICA, author = "Shah Muhammad Imtiyaj Uddin and Rashedul Islam Sumon and Md Ariful Islam Mozumder and Md Kamran Hussin Chowdhury and Tagne Poupi Theodore Armand and Hee Cheol Kim", title = "Innovations and Challenges of {AI} in Film: a Methodological Framework for Future Exploration", journal = j-TOMM, volume = "21", number = "7", pages = "199:1--199:55", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736724", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With a focus on producing a new AI-based filmmaking platform, this study explores the revolutionary impact of AI on the film industry. Because of how movies are envisioned, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "199", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Telili:2025:CHP, author = "Ahmed Telili and Wassim Hamidouche and Hadi Amirpour and Sid Ahmed Fezza and Christian Timmerer and Luce Morin", title = "Convex Hull Prediction Methods for Bitrate Ladder Construction: Design, Evaluation, and Comparison", journal = j-TOMM, volume = "21", number = "7", pages = "200:1--200:23", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723006", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "HTTP adaptive streaming (HAS) has emerged as a prevalent approach for over-the-top (OTT) video streaming services due to its ability to deliver a seamless user experience. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "200", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:TKP, author = "Jiaqi Wang and Ricky Y.-K. Kwok and Edith C. H. Ngai", title = "Towards Key Point Identification ({KPI}) for Lecture Videos: Approaches and Performance Evaluation", journal = j-TOMM, volume = "21", number = "7", pages = "201:1--201:23", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746640", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "To maximize the utility of lecture videos, in today's fast-paced society with dwindling attention spans, various e-learning technologies are introduced, e.g., non-linear \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "201", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Du:2025:SCI, author = "Longye Du and Shuaiyu Deng and Ying Li and Jun Li and Qi Tian", title = "A Survey on Composed Image Retrieval", journal = j-TOMM, volume = "21", number = "7", pages = "202:1--202:27", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3723879", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Composed Image Retrieval (CIR) processes a query consisting of a reference image and a modification text, aiming to retrieve target images that not only resemble the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "202", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Vahdati:2025:MRW, author = "Monireh (Monica) Vahdati and Fedwa Laamarti and Abdulmotaleb {El Saddik}", title = "Meta-Review of Wearable Devices for Healthcare in the Metaverse", journal = j-TOMM, volume = "21", number = "7", pages = "203:1--203:36", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705320", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recent years, there has been a growing interest in leveraging the metaverse to enhance community engagement and healthcare. This article provides a comprehensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "203", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shao:2025:TRV, author = "Xuan Shao and Lin Zhang and Tianjun Zhang and Shengjie Zhao", title = "Towards a Robust Visual-Inertial-Surround-View {SLAM} System for Autonomous Indoor Parking", journal = j-TOMM, volume = "21", number = "7", pages = "204:1--204:23", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3742433", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "An autonomous parking system is a low-speed unmanned driving system applied in indoor parking environments. Real-time and high-precision vehicle localization and map \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "204", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:GGA, author = "Zongsheng Cao and Qianqian Xu and Zhiyong Yang and Yuan He and Xiaochun Cao and Qingming Huang", title = "{GAHE}: Geometry-Aware Embedding for Hyper-Relational Knowledge Graph Representation", journal = j-TOMM, volume = "21", number = "7", pages = "205:1--205:26", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3733602", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Knowledge graphs have proven highly effective for learning representations of entities and relations, with hyper-relational knowledge graphs (HKGs) gaining increased \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "205", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fang:2025:CLD, author = "Jiajie Fang and Mengjuan Jiang and Jiaqing Fan and Bangjun Wang and Fanzhang Li", title = "Complementarily Learning Decoupled Category-Region-Aware Prototype for Few-Shot Classification", journal = j-TOMM, volume = "21", number = "7", pages = "206:1--206:22", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3737645", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Open-world few-shot classification is restricted by inadequate image-level content representation capabilities when the training and testing sets have significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "206", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:SSC, author = "Zheng Liu and Kunyu Yang and Yu Weng and Zheng He and Xuan Liu and Honghao Gao", title = "{SCAG}: Semantic Co-occurring Attention Guided Alignment for Knowledge-based Visual Question Answering", journal = j-TOMM, volume = "21", number = "7", pages = "207:1--207:20", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3734220", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In the realm of Knowledge-based Visual Question Answering (KB-VQA), the intricacy of the task lies in adeptly retrieving pertinent information from external sources and seamlessly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "207", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:MVP, author = "Weiyu Wang and Chunmei Qing and Junpeng Tan and Xiangmin Xu", title = "Multi-view Panoramic Image Style Transfer with Multi-scale Attention and Global Sharing", journal = j-TOMM, volume = "21", number = "7", pages = "208:1--208:19", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Style transfer for panoramic images is a challenging task, due to the problems associated with its unique structure, including edge discontinuities, pole distortion, fuzzy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "208", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:HOA, author = "Lu Zhang and Rui Yao and Yuhong Zhang and Yong Zhou and Fuyuan Hu and Jiaqi Zhao and Zhiwen Shao", title = "Historical Object-Aware Prompt Learning for Universal Hyperspectral Object Tracking", journal = j-TOMM, volume = "21", number = "7", pages = "209:1--209:20", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736581", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Hyperspectral Object Tracking (HOT), utilizing rich spectral information from hyperspectral video (HSV), holds significant importance for object tracking. We identify that a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "209", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Aoun:2025:MBL, author = "Alain Aoun and Mahmoud Masadeh and Sofi{\`e}ne Tahar", title = "{ML}-based Load Value Approximator for Efficient Multimedia Processing", journal = j-TOMM, volume = "21", number = "7", pages = "210:1--210:18", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736582", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Approximate computing (AC) has gained traction as an alternative computing method for energy-efficient processing. This article proposes the exploitation of AC to address the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "210", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2025:AHM, author = "Fubin Guo and Qi Wang and Qingshan Wang and Sheng Chen", title = "Accurate Hand Modeling in Whole-Body Mesh Reconstruction Using Joint-Level Features and Kinematic-Aware Topology", journal = j-TOMM, volume = "21", number = "7", pages = "211:1--211:23", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743138", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Whole-body mesh reconstruction utilizes neural networks to reconstruct the 3D human body, face, and hands, forming a fundamental task in computer vision. It is used to model \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "211", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yu:2025:TTG, author = "Zhipeng Yu and Zimeng Zhao and Yanxi Du and Yuzhou Zheng and Binghui Zuo and Yangang Wang", title = "{T2C}: Text-guided {4D} Cloth Generation", journal = j-TOMM, volume = "21", number = "7", pages = "212:1--212:19", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735642", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In the age of AIGC, the creation process is increasingly automated. Generating vivid characters with clothing and motions according to scripts or novels is no exception. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "212", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:ANN, author = "Yue Li and Junru Li and Chaoyi Lin and Kai Zhang and Li Zhang and Franck Galpin and Thierry Dumas and Hongtao Wang and Muhammed Coban and Jacob Str{\"o}m and Du Liu and Kenneth Andersson", title = "Advanced Neural Network-Based Video Coding Technologies for Intra Prediction and In-Loop Filtering", journal = j-TOMM, volume = "21", number = "7", pages = "213:1--213:23", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3733108", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The past decade has witnessed the huge success of deep learning in well-known artificial intelligence applications such as face recognition, autonomous driving, and large \ldots{}", acknowledgement = ack-nhfb, 
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "213", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Griwodz:2025:ISI, author = "Carsten Griwodz and Mea Wang and Roger Zimmermann", title = "Introduction to the Special Issue on {MMSys 2023} and {NOSSDAV 2023}", journal = j-TOMM, volume = "21", number = "9", pages = "244:1--244:4", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722560", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "244", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:EEB, author = "Na Li and Zichen Zhu and Sheng Wei and Yao Liu", title = "{EVASR}: Edge-Based Salience-Aware Super-Resolution for Enhanced Video Quality and Power Efficiency", journal = j-TOMM, volume = "21", number = "9", pages = "245:1--245:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711928", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid growth of video content consumption, it is important to deliver high-quality streaming videos to users even under limited available network bandwidth. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "245", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kimura:2025:EAV, author = "Bruno Kimura and Simone Ferlin and Thomas Paiva and Toktam Mahmoodi and Anna Brunstrom and Ozgu Alay", title = "Evaluating Adaptive Video Streaming over Multipath {QUIC} with Shared Bottleneck Detection", journal = j-TOMM, volume = "21", number = "9", pages = "246:1--246:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711862", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The promises of multipath transport are to aggregate bandwidth, improve resource utilisation and enhance reliability. In this article, we demonstrate that the way \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "246", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gokarn:2025:RMR, author = "Ila Gokarn and Yigong Hu and Tarek Abdelzaher and Archan Misra", title = "{RA-MOSAIC}: Resource Adaptive Edge {AI} Optimization over Spatially Multiplexed Video Streams", journal = j-TOMM, volume = "21", number = "9", pages = "247:1--247:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715133", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sustaining real-time, high-fidelity AI-based vision perception on edge devices is challenging due to both the high computational overhead of increasingly ``deeper'' Deep Neural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "247", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:SST, author = "Jiaxi Li and Jingwei Liao and Bo Chen and Anh Nguyen and Aditi Tiwari and Qian Zhou and Zhisheng Yan and Klara Nahrstedt", title = "{ST-360}: Spatial-Temporal Filtering-Based Low-Latency 360-Degree Video Analytics Framework", journal = j-TOMM, volume = "21", number = "9", pages = "248:1--248:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3694685", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent advances in computer vision algorithms and video streaming technologies have facilitated the development of edge-server-based video analytics systems, enabling them to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "248", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Araujo:2025:DVP, author = "Gabriel de Castro Ara{\'u}jo and Henrique Domingues Garcia and Myl{\`e}ne C. Q. Farias and Ravi Prakash and Marcelo M. 
Carvalho", title = "A 360-degree Video Player for Dynamic Video Editing Applications", journal = j-TOMM, volume = "21", number = "9", pages = "249:1--249:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715135", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "360-degree videos introduce unique challenges to storytelling, especially if filmmakers want to draw the user's attention to specific regions of the surrounding sphere to ensure the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "249", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rudolph:2025:TVP, author = "Michael Rudolph and Stefan Schneegass and Amr Rizk", title = "Transcoding {V-PCC} Point Cloud Streams in Real-time", journal = j-TOMM, volume = "21", number = "9", pages = "250:1--250:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3682062", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Dynamic Point Clouds are a representation for three-dimensional (3D) immersive media that allows users to freely navigate a scene while consuming the content. However, this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "250", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fang:2025:SML, author = "Hao Fang and Haoyuan Zhao and Feng Wang and Yi Ching Chou and Long Chen and Jianxin Shi and Jiangchuan Liu", title = "Streaming Media over {LEO} Satellite Networking: a Measurement-Based Analysis and Optimization", journal = j-TOMM, volume = "21", number = "9", pages = "251:1--251:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3694976", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, Low Earth orbit Satellite Networks (LSNs) have been suggested as a critical and promising component toward high-bandwidth and low-latency global coverage in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "251", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ameur:2025:SFS, author = "Zoubida Ameur and Claire-H{\'e}l{\`e}ne Demarty and Olivier Le Meur and Daniel M{\'e}nard", title = "{Style-FG}: a Style-based Framework for Film Grain Analysis and Synthesis", journal = j-TOMM, volume = "21", number = "9", pages = "252:1--252:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712592", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Film grain which used to be a by-product of the chemical processing in the analog film stock is a desirable feature in the era of digital cameras. Besides participating to the artistic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "252", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Abreu:2025:AUE, author = "Raphael Abreu and Joel dos Santos and Gheorghita Ghinea and D{\'e}bora C. Muchaluat-Saade", title = "Assessing Usefulness, Ease of Use, and Recognition Performance of Semi-Automatic Mulsemedia Authoring", journal = j-TOMM, volume = "21", number = "9", pages = "253:1--253:19", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689640", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Mulsemedia (Multiple Sensorial Media) authoring poses a considerable challenge as authors navigate the intricate task of identifying moments to activate sensory effects \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "253", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Rossi:2025:CAU, author = "Silvia Rossi and Irene Viola and Laura Toni and Pablo Cesar", title = "A Clustering Approach to Unveil User Similarities in {6DoF} Extended Reality Applications", journal = j-TOMM, volume = "21", number = "9", pages = "254:1--254:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701734", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The advent in our daily life of Extended Reality (XR) technologies, such as Virtual and Augmented Reality, has led to the rise of user-centric systems, offering higher level \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. 
Commun. Appl.", articleno = "254", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{John:2025:MCF, author = "Vijay John and Yasutomo Kawanishi", title = "Multimodal Cascaded Framework with Multimodal Latent Loss Functions Robust to Missing Modalities", journal = j-TOMM, volume = "21", number = "9", pages = "255:1--255:21", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711860", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Despite interest in multimodal classification, few studies have addressed the missing modality problem in which an incomplete multimodal input with one or more missing modalities is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "255", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lee:2025:ACV, author = "Kuan-Yu Lee and Ashutosh Singla and Pablo Cesar and Cheng-Hsin Hsu", title = "Adaptive Cloud {VR} Gaming Optimized by Gamer {QoE} Models", journal = j-TOMM, volume = "21", number = "9", pages = "256:1--256:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3680551", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cloud Virtual Reality (VR) gaming offloads computationally intensive VR games to resourceful data centers. However, ensuring good Quality of Experience (QoE) in cloud VR \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "256", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:PCD, author = "Yuqing Yang and Anh Nguyen and Zhisheng Yan", title = "A Patch Can Disrupt Live Video Streaming: Physical Adversarial Attacks on Deep Learning Compression", journal = j-TOMM, volume = "21", number = "9", pages = "257:1--257:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750047", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep learning (DL)-based compression has achieved outstanding performance compared to traditional compression. However, due to the vulnerability of adversarial attacks on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "257", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qu:2025:AHL, author = "Xiaoye Qu and Qiyuan Chen and Wei Wei and Jiashuo Sun and Daizong Liu and Jianfeng Dong", title = "Alleviating Hallucination in Large Vision-Language Models with Active Retrieval Augmentation", journal = j-TOMM, volume = "21", number = "9", pages = "258:1--258:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3742434", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Despite the remarkable ability of Large Vision-Language Models (LVLMs) in image comprehension, these models frequently generate plausible yet factually \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "258", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:SCD, author = "Bing Liu and Wenjie Yang and Mingming Liu and Hao Liu and Yong Zhou and Peng Liu", title = "Syntactic-Conditional Diffusion Networks for Controllable Image Captioning", journal = j-TOMM, volume = "21", number = "9", pages = "259:1--259:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748653", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current diffusion model-based image captioning methods generally focus on generating descriptions in a non-autoregressive manner. Nevertheless, it is not trivial to employ \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "259", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:BPD, author = "Liyong Xu and Yifan Jiao and Bing-Kun Bao", title = "Bool Prompt with Decomposition and Enhancement: Zero-Shot {VQA} Based on {PVLMs}", journal = j-TOMM, volume = "21", number = "9", pages = "260:1--260:21", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744343", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Zero-Shot Visual Question Answering (ZSVQA) aims to answer questions about images without prior training on explicit image question pairs. Most existing methods usually apply \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "260", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:MMR, author = "Pengyu Li and Cheolkon Jung", title = "{MRFGNet}: Multiscale Reference Frame Generation Network for {VVC} Inter-Coding", journal = j-TOMM, volume = "21", number = "9", pages = "261:1--261:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750049", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Since the quality of the reference frames is critical for VVC inter-coding, the neural network (NN)-based reference frame generation aims to generate a better quality reference \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "261", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xia:2025:SIA, author = "Guiyu Xia and Zhedong Jin and Dongdong Fang and Yubao Sun", title = "Source Information-Assisted {UV}-Space Transformation Network for Person Image Generation", journal = j-TOMM, volume = "21", number = "9", pages = "262:1--262:16", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3749375", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Person image generation is widely used in many fields, but it still faces some challenges. Most of current person image generation methods suffer from an intractable problem of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "262", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:MFI, author = "Junle Liu and Yun Zhang and Zixi Guo and Xiaoxia Huang and Gangyi Jiang", title = "Multiscale Feature Importance-Based Bit Allocation for End-to-End Feature Coding for Machines", journal = j-TOMM, volume = "21", number = "9", pages = "263:1--263:19", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748654", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Feature Coding for Machines (FCM) aims to compress intermediate features effectively for remote intelligent analytics, which is crucial for future intelligent visual applications. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "263", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ji:2025:ITS, author = "Hefeng Ji and Jing Xiao and Jiefan Lin and Jimin Liu and Haoyong Yu", title = "Intelligent Tumor Synthesis Based on Medical Image Knowledge for Liver Tumor Segmentation", journal = j-TOMM, volume = "21", number = "9", pages = "264:1--264:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3749104", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Accurate segmentation of liver tumors is crucial for their proper diagnosis and treatment. However, achieving high levels of precision typically depends on meticulous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "264", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ding:2025:VIP, author = "Hao Ding and Jing Sun and Rui Long and Xiaoping Jiang and Hongling Shi and Yuting Qin and Zongze Li and Jian-Jin Li", title = "Visible-Infrared Person Re-Identification Based on Feature Decoupling and Refinement", journal = j-TOMM, volume = "21", number = "9", pages = "265:1--265:16", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3749843", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The objective of visible-infrared person re-identification is to accurately match pedestrian images captured in different modalities. Since these images are taken from varying \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "265", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pathak:2025:GRT, author = "Sanhita Pathak and Vinay Kaushik and Brejesh Lall", title = "Garment Recycle Training and Conditional Garment-Person Outline Attention-Guided Virtual Tryon", journal = j-TOMM, volume = "21", number = "9", pages = "266:1--266:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3758098", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Virtual try-on, a significant application in computer vision, aims to seamlessly simulate the appearance of clothing on a person from a single image. We propose a diffusion-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "266", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:MEA, author = "Zishan Xu and Xiaofeng Zhang and Yuqing Yang and Wei Chen and Jueting Liu and Tingting Xu and Zehua Wang and Abdulmotaleb {El Saddik}", title = "{MuralAgent}: Enhancing Ancient Mural Outpainting with {RAG}-Based Texts and Multimodal Integration", journal = j-TOMM, volume = "21", number = "9", pages = "267:1--267:17", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743679", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In the context of the digital age, utilizing cutting-edge technology for the digitization and creative expansion of ancient murals is crucial, aimed at preserving and passing on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "267", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:FFC, author = "Hanzhang Wang and Haoran Wang and Zhongrui Yu and Mingming Sun and Junjun Jiang and Xianming Liu and Deming Zhai", title = "{FAST}: Flexibly Controllable Arbitrary Style Transfer via Latent Diffusion Models", journal = j-TOMM, volume = "21", number = "9", pages = "268:1--268:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748655", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The goal of Arbitrary Style Transfer (AST) is injecting the artistic features of a style reference into a given image/video. 
Existing methods usually pursue the balance between \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "268", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:BMD, author = "Zhichao Zhang and Wei Sun and Xinyue Li and Jun Jia and Xiongkuo Min and Zicheng Zhang and Chunyi Li and Zijian Chen and Puyi Wang and Fengyu Sun and Shangling Jui and Guangtao Zhai", title = "Benchmarking Multi-dimensional {AIGC} Video Quality Assessment: a Dataset and Unified Model", journal = j-TOMM, volume = "21", number = "9", pages = "269:1--269:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3749844", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In recent years, AI-driven video generation has gained significant attention due to great advancements in visual and language generative techniques. Consequently, there is a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "269", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:AAV, author = "Bowen Huang and Yanwei Zheng and Chuanlin Lan and Dongchen Sui and Xinpeng Zhao and Xiao Zhang and Mengbai Xiao and Dongxiao Yu", title = "Action-Aware Visual-Textual Alignment for Long-Instruction Vision-and-Language Navigation", journal = j-TOMM, volume = "21", number = "9", pages = "270:1--270:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748656", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Traditional Vision-and-Language Navigation (VLN) requires an agent to navigate to a target location solely based on visual observations, guided by natural language \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "270", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lu:2025:KDP, author = "Chenyang Lu and Zhikai Wei and Huapeng Wu and Le Sun and Tianming Zhan", title = "{KANformer}: Dual-Priors-Guided Low-Light Enhancement via {KAN} and Transformer", journal = j-TOMM, volume = "21", number = "9", pages = "271:1--271:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750732", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Images captured under low-light conditions suffer from poor visibility and clarity due to insufficient light. The emergence of deep learning has greatly boosted the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "271", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2025:ULT, author = "Xiang Guo and Ruimin Hu and Dong Liang Zhu and Mei Wang", title = "Uniform Light Transformer for Person Re-identification under Complex Illumination", journal = j-TOMM, volume = "21", number = "9", pages = "272:1--272:18", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3745786", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The quality of pedestrian image retrieval is affected by the difference in illumination between images. Previous studies have used one-to-one lighting transformers to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "272", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:LLP, author = "Xin Liu and Qiya Song and Lin Xiao and Chun Wang and Xieping Gao", title = "{LPIC}: Learnable Prompts and {ID}-guided Contrastive Learning for Multimodal Recommendation", journal = j-TOMM, volume = "21", number = "9", pages = "273:1--273:16", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735561", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Thu Oct 2 11:38:10 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Multimodal recommendation systems improve the accuracy of recommendations by integrating information from different modalities to obtain potential representations of users and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "273", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Falcon:2025:ISI, author = "Alex Falcon and Giuseppe Serra and Sergio Escalera and Michael Wray", title = "Introduction to the Special Issue on Text-Multimedia Retrieval: Retrieving Multimedia Data by Means of Natural Language", journal = j-TOMM, volume = "21", number = "10", pages = "274:1--274:4", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750451", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "274", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ge:2025:FGA, author = "Shiping Ge and Zhiwei Jiang and Yafeng Yin and Cong Wang and Zifeng Cheng and Qing Gu", title = "Fine-Grained Alignment Network for Zero-Shot Cross-Modal Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "275:1--275:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722223", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Zero-Shot Cross-Modal Retrieval (ZS-CMR) aims to perform cross-modal retrieval on data of unseen classes, where a key challenge is how to address the modality-gap and domain-shift problems simultaneously. Existing methods tackle this challenge mainly by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "275", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:CIP, author = "Suyi Li and Chenyi Jiang and Shidong Wang and Yang Long and Zheng Zhang and Haofeng Zhang", title = "Contextual Interaction via Primitive-based Adversarial Training for Compositional Zero-shot Learning", journal = j-TOMM, volume = "21", number = "10", pages = "276:1--276:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712596", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Compositional Zero-shot Learning (CZSL) aims to identify novel compositions via known attribute-object pairs. The primary challenge in CZSL tasks lies in the significant discrepancies introduced by the complex interaction between the visual primitives of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "276", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:MMH, author = "Ying Li and Yuxiang Ding", title = "{MoHGCN}: Momentum Hypergraph Convolution Network for Cross-modal Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "277:1--277:21", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735135", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cross-modal retrieval tasks, encompassing the retrieval of image-text, video-audio, and more, are progressively gaining significance in response to the exponential growth of information on the Internet. 
However, there has always been a cloud hanging over \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "277", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2025:LVS, author = "Suncheng Xiang and Jingsheng Gao and Mingye Xie and Mengyuan Guan and Jiacheng Ruan and Yuzhuo Fu", title = "Learning Visual-Semantic Embedding for Generalizable Person Re-Identification: a Unified Perspective", journal = j-TOMM, volume = "21", number = "10", pages = "278:1--278:17", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3726528", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Generalizable person Re-Identification (Re-ID) is a very hot research topic in machine learning and computer vision, which plays a significant role in realistic scenarios due to its various applications in public security and video surveillance. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "278", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2025:RII, author = "Renjie Pan and Hua Yang and Xiangyu Zhao", title = "{ReAL}: Improving Image-Text Retrieval with Authentic Negative Repository Learning", journal = j-TOMM, volume = "21", number = "10", pages = "279:1--279:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729172", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current methods for image-text retrieval commonly propose various fusion modules to achieve robust visual-textual alignment, primarily relying on in-batch learning to guide the matching process. Some follow-up methods seek to enlarge the number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "279", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:MSF, author = "Shunxiang Zhang and Jiajia Liu and Yixuan Jiao and Yulei Zhang and Lei Chen and Kuanching Li", title = "A Multimodal Semantic Fusion Network with Cross-Modal Alignment for Multimodal Sentiment Analysis", journal = j-TOMM, volume = "21", number = "10", pages = "280:1--280:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744648", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "User-generated multimodal data can provide powerful sentiment clues for sentiment analysis task. Existing works have aligned common sentiment features in different modalities through various multimodal fusion methods. 
However, these works have certain \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "280", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ergasti:2025:MPM, author = "Alex Ergasti and Tomaso Fontanini and Claudio Ferrari and Massimo Bertozzi and Andrea Prati", title = "{MARS}: Paying More Attention to Visual Attributes for Text-Based Person Search", journal = j-TOMM, volume = "21", number = "10", pages = "281:1--281:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721482", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Text-Based Person Search (TBPS) is a problem that gained significant interest within the research community. The task is that of retrieving one or more images of a specific individual based on a textual description. The multi-modal nature of the task \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "281", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nishimura:2025:VLM, author = "Taichi Nishimura and Shota Nakada and Masayoshi Kondo", title = "Vision-Language Models Learn Super Images for Efficient Partially Relevant Video Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "282:1--282:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708349", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose an efficient and high-performance method for partially relevant video retrieval. The method aims to retrieve long videos that contain at least one moment relevant to the input text query. The challenge lies in encoding dense \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "282", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:MSC, author = "Liming Xu and Hanqi Li and Jie Shao and Xianhua Zeng and Weisheng Li", title = "Multi-scale Consistency Deep Lifelong Cross-modal Hashing", journal = j-TOMM, volume = "21", number = "10", pages = "283:1--283:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3704636", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep cross-modal hashing methods provide effective and efficient solutions for large-scale cross-modal retrieval. 
However, existing cross-modal hashing methods fail to capture the dynamic changes of real-world data and suffer from serious performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "283", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2025:DDL, author = "Liming Xu and Dengping Zhao and Hanqi Li and Xianhua Zeng and Bochuan Zheng", title = "Deep Differential Lifelong Cross-modal Hashing for Stream Medical Data Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "284:1--284:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721432", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the explosive growth of stream medical multi-modal data, it is significant to develop an efficient cross-modal retrieval algorithm to achieve effective medical data search. Within it, deep cross-modal hashing which maps cross-modal data into low-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "284", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:MGA, author = "Qun Zhang and Chao Yang and Bin Jiang and Bolin Zhang", title = "Multi-Grained Alignment with Knowledge Distillation for Partially Relevant Video Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "285:1--285:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716388", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Partially Relevant Video Retrieval (PRVR) aims to accurately retrieve the most relevant video in response to a query from untrimmed videos. The analysis of video content can be done at three different granularities: frame-level, clip-level, and video-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "285", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2025:IIR, author = "Hongyi Zhu and Jia-Hong Huang and Yixian Shen and Stevan Rudinac and Evangelos Kanoulas", title = "Interactive Image Retrieval Meets Query Rewriting with Large Language and Vision Language Models", journal = j-TOMM, volume = "21", number = "10", pages = "286:1--286:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744910", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image search is a pivotal task in multi-media and computer vision, finding applications across diverse domains, ranging from internet search to medical diagnostics. 
Conventional image search systems operate by accepting textual or visual queries and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "286", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ehsani:2025:ETQ, author = "Sina Ehsani and Jian Liu", title = "Elevating Textual Question Answering with On-Demand Visual Augmentation", journal = j-TOMM, volume = "21", number = "10", pages = "287:1--287:25", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3729231", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Textual Question Answering (TQA) remains a formidable challenge, despite over a decade of research. The integration of transformer networks and external knowledge via pre-trained models has marked a significant advancement in TQA. Yet, a crucial element \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "287", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gragnaniello:2025:VFR, author = "Diego Gragnaniello and Antonio Greco and Carlo Sansone and Bruno Vento", title = "Video Fire Recognition Using Zero-Shot Vision-Language Models Guided by a Task-Aware Object Detector", journal = j-TOMM, volume = "21", number = "10", pages = "288:1--288:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721291", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Fire detection from images or videos has gained a growing interest in recent years due to the criticality of the application. Both reliable real-time detectors and efficient retrieval techniques, able to process large databases acquired by sensor \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "288", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Messina:2025:JDL, author = "Nicola Messina and Jan Sedmidubsk{\'y} and Fabrizio Falchi and Tom{\'a}{\v s} Rebok", title = "Joint-Dataset Learning and Cross-Consistent Regularization for Text-to-Motion Retrieval", journal = j-TOMM, volume = "21", number = "10", pages = "289:1--289:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744565", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Pose-estimation methods enable extracting human motion from common videos in the structured form of 3D skeleton sequences.
Despite great application opportunities, effective content-based access to such spatio-temporal motion data is a challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "289", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2025:LGV, author = "Jianbo Song and Hong Zhang and Yachun Feng and Hanyang Liu and Yifan Yang", title = "Language-guided Visual Tracking: Comprehensive and Effective Multimodal Information Fusion", journal = j-TOMM, volume = "21", number = "10", pages = "290:1--290:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3757322", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Current vision-language trackers often struggle to fuse multimodal information comprehensively and effectively, leading to suboptimal performance in multimodal tasks. This study introduces LGTrack, a novel language-guided visual tracking framework \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "290", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bhayana:2025:TCF, author = "Divya Arora Bhayana and Om Prakash Verma", title = "Trans-Convo-Former Net for Hierarchical Prediction of Household Images", journal = j-TOMM, volume = "21", number = "10", pages = "291:1--291:21", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3757323", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Image classification has become the backbone of computer vision in recent times. Hierarchical image classification has been a scarcely exploited field, particularly in household images. Although many convolution and transformer learning models have been \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "291", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2025:LRR, author = "Xiaobo Hu and Youfang Lin and Jinwen Wang and Yue Liu and Shuo Wang and Hehe Fan and Kai Lv", title = "Learning Robust Representations via Bidirectional Transition for Visual Reinforcement Learning", journal = j-TOMM, volume = "21", number = "10", pages = "292:1--292:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3765517", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Visual reinforcement learning has exhibited efficacy in solving control tasks characterized by high-dimensional observations. 
However, a central challenge persists in deriving dependable and generalizable representations from vision-based observations. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "292", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:TBS, author = "Mingliang Zhou and Shuqi Han and Jun Luo and Xu Zhuang and Qin Mao and Zhengguo Li", title = "Transformer-Based and Structure-Aware Dual-Stream Network for Low-Light Image Enhancement", journal = j-TOMM, volume = "21", number = "10", pages = "293:1--293:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3758097", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we propose an end-to-end Transformer-based and structure-aware dual-stream network for low-light image enhancement. First, we divide the dual-stream network into a main stream and a structure stream. The main stream is used to recover an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "293", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:DBC, author = "Yuan Cao and Dong Wang", title = "Dual-Branch Cross-Layer Information Flow Network for Camouflaged Object Detection in Complex Scenes", journal = j-TOMM, volume = "21", number = "10", pages = "294:1--294:19", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764866", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Camouflaged Object Detection (COD) aims to accurately detect objects that blend in with their backgrounds. While deep learning-based methods have significantly improved detection accuracy, the persistent similarity in chromatic and textural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "294", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:ELD, author = "Haojie Li and Hao Chen and Yining Huang and Tianshui Chen and Shuangping Huang", title = "Enhancing Lip Dynamic Authenticity: Learning {3D} Temporal Representations for Talking Head Synthesis", journal = j-TOMM, volume = "21", number = "10", pages = "295:1--295:21", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750048", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Audio-driven talking head synthesis aims to generate lifelike facial animations synchronized with audio. 
Current approaches primarily focus on lip motion information in 2D visual space for lip-audio synchronization and expressive lip dynamic, often \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "295", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:PGS, author = "Zhili Zhou and Wensheng Zhang and Zhengdao Li and Huilin Ge and Bin Qiu and Fengjun Xiao and Yongfeng Huang", title = "Progressive Generative Steganography via High-Resolution Image Generation for Covert Communication", journal = j-TOMM, volume = "21", number = "10", pages = "296:1--296:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760531", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, as one of the most popular covert communication technologies, generative steganography has received ever-increasing attention due to its promising performance against sophisticated steganalysis tools. However, it is quite difficult for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "296", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:BTA, author = "Jingtian Wang and Xiaolong Li and Bin Ma and Yao Zhao", title = "Boosting Transferability of Adversarial Examples with Spatio-Temporal Context", journal = j-TOMM, volume = "21", number = "10", pages = "297:1--297:22", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3766545", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Transferable adversarial examples have received increasing attention for their utility in spoofing multiple models, but existing attacks still perform poorly in terms of transferability. In light of this, a novel attack method called Spatio-Temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "297", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Guo:2025:MMM, author = "Xu Guo and Tong Zhang and Fuyun Wang and Xudong Wang and Xiaoya Zhang and Xin Liu and Zhen Cui", title = "{MMHCL}: Multi-Modal Hypergraph Contrastive Learning for Recommendation", journal = j-TOMM, volume = "21", number = "10", pages = "298:1--298:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762665", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The burgeoning presence of multimodal content-sharing platforms propels the development of personalized recommender systems. 
Previous works usually suffer from data sparsity and cold-start problems and may fail to adequately explore semantic user-product \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "298", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Pan:2025:GNG, author = "Xiao Pan and Zongxin Yang and Shuai Bai and Yi Yang", title = "{GD-NeRF}: Generative Detail Compensation for One-shot Generalizable Neural Radiance Fields", journal = j-TOMM, volume = "21", number = "10", pages = "299:1--299:24", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748331", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In this article, we focus on the one-shot novel view synthesis task which targets synthesizing photo-realistic novel views given only one reference image per scene. Previous One-shot Generalizable Neural Radiance Field (OG-NeRF) methods solve this task \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "299", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yao:2025:CMT, author = "Jiacheng Yao and Jing Zhang and Shuying Zhang and Li Zhuo", title = "Cross-Modal Tri-Semantic Correlation-{CLIP} for Short Video Homogenization Recognition", journal = j-TOMM, volume = "21", number = "10", pages = "300:1--300:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762193", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Short videos are one of the most popular social media in the world, triggering a proliferation of copycat creations leading to homogenized video content, with visual and textual homogenization being the most prevalent. Unlike near-duplicate video \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "300", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shao:2025:MER, author = "Zhiwen Shao and Yifan Cheng and Fan Zhang and Xuehuai Shi and Canlin Li and Lizhuang Ma and Dit-Yan Yeung", title = "Micro-Expression Recognition via Fine-Grained Dynamic Perception", journal = j-TOMM, volume = "21", number = "10", pages = "301:1--301:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3765901", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Facial micro-expression recognition (MER) is a challenging task, due to the transience, subtlety, and dynamics of micro-expressions (MEs). 
Most existing methods resort to hand-crafted features or deep networks, in which the former often additionally \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "301", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:EIN, author = "Yue Liu and Zhangkai Ni and Peilin Chen and Shiqi Wang and Xinfeng Zhang and Hanli Wang and Sam Kwong", title = "{EIN}: Exposure-Induced Network for Single-Image {HDR} Reconstruction", journal = j-TOMM, volume = "21", number = "10", pages = "302:1--302:23", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3763240", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Reconstructing high dynamic range (HDR) images from standard dynamic range (SDR) ones has received growing attention in recent years. A predominant problem of this task lies in the absence of texture and structural information in under/over-exposed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "302", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ghorbanpour:2025:RRD, author = "Ali Ghorbanpour and Mohammad Amin Arab and Mohamed Hefeeda", title = "{RDIAS}: Robust and Decentralized Image Authentication System", journal = j-TOMM, volume = "21", number = "10", pages = "303:1--303:28", month = oct, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760260", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Sat Oct 25 07:23:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent AI tools can subtly manipulate images, eroding users' trust in the authenticity of images they see on their displays. Current image authentication methods either detect artifacts that may result from manipulations or attach hashes of images as \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "303", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fei:2025:ISI, author = "Hao Fei and Wei Ji and Yinwei Wei and Zhedong Zheng and Jialie Shen and Alan Hanjalic and Roger Zimmermann", title = "Introduction to the Special Issue on Deep Multimodal Generation and Retrieval", journal = j-TOMM, volume = "21", number = "11", pages = "304:1--304:13", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762666", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "304", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:CNC, author = "Haoran Li and Yanbin Hao and Jiarui Yu and Bin Zhu and Shuo Wang and Tong Xu", title = "{CVLP-NaVD}: Contrastive Visual-language Pre-training Models for Non-annotated Visual Description", journal = j-TOMM, volume = "21", number = "11", pages = "305:1--305:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708348", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "305", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:PCB, author = "Jianrong Zhang and Hehe Fan and Yi Yang", title = "Protein Captioning: Bridging the Gap between Protein Sequences and Natural Languages", journal = j-TOMM, volume = "21", number = "11", pages = "306:1--306:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705322", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "306", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:MEC, author = "Bobo Li and Hao Fei and Fei Li and Tat-Seng Chua and Donghong Ji", title = "Multimodal Emotion-Cause Pair Extraction with Holistic Interaction and Label Constraint", journal = j-TOMM, volume = "21", number = "11", pages = "307:1--307:19", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689646", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "307", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2025:MBM, author = "Tao Chen and Enwei Zhang and Yuting Gao and Ke Li and Xing Sun and Yan Zhang and Hui Li and Rongrong Ji", title = "{MMICT}: Boosting Multi-Modal Fine-Tuning with In-Context Examples", journal = j-TOMM, volume = "21", number = "11", pages = "308:1--308:17", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3688804", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "308", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:LOS, author = "Xiaojie Zhou and Hang Yu and Shengjie Yang and Jing Huo and Pinzhuo Tian", title = "Learning from Orthogonal Space with Multimodal Large Models for Generalized Few-shot Segmentation", journal = j-TOMM, volume = "21", number = "11", pages = "309:1--309:22", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712597", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "309", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:BMR, author = "Yu Liu and Haipeng Chen and Guihe Qin and Jincai Song and Xun Yang", title = "Bias Mitigation and Representation Optimization for Noise-Robust Cross-Modal Retrieval", journal = j-TOMM, volume = "21", number = "11", pages = "310:1--310:17", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3700596", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "310", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:AUP, author = "Peng Wang and Yongheng Zhang and Hao Fei and Qiguang Chen and Yukai Wang and Jiasheng Si and Wenpeng Lu and Min Li and Libo Qin", title = "{S$^3$} Agent: Unlocking the Power of {VLLM} for Zero-Shot Multi-Modal Sarcasm Detection", journal = j-TOMM, volume = "21", number = "11", pages = "311:1--311:16", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3690642", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "311", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gan:2025:EHF, author = "Yuan Gan and Ruijie Quan and Yawei Luo", title = "{ExpAvatar}: High-Fidelity Avatar Generation of Unseen Expressions with {3D} Face Priors", journal = j-TOMM, volume = "21", number = "11", pages = "312:1--312:21", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3700770", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun.
Appl.", articleno = "312", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cai:2025:PAP, author = "Haoyu Cai and Wenqi Lou and Chao Wang and Xuehai Zhou", title = "{Picasso}: Analyzing Prompt Design for Text-to-Image Generative Diffusion Models from a Temporal-Spatial Perspective", journal = j-TOMM, volume = "21", number = "11", pages = "313:1--313:24", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3724122", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "313", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2025:IDG, author = "Hongchen Wei and Zhenzhong Chen", title = "Improving Domain Generalization for Image Captioning with Unsupervised Prompt Learning", journal = j-TOMM, volume = "21", number = "11", pages = "314:1--314:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715136", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "314", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2025:CHQ, author = "Yiwei Ma and Yijun Fan and Jiayi Ji and Haowei Wang and Haibing Yin and Xiaoshuai Sun and Rongrong Ji", title = "Creating High-Quality {3D} Content by Bridging the Gap between Text-to-{2D} and Text-to-{3D} Generation", journal = j-TOMM, volume = "21", number = "11", pages = "315:1--315:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3687475", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "315", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shen:2025:TCR, author = "Fei Shen and Xiaoyu Du and Liyan Zhang and Xiangbo Shu and Jinhui Tang", title = "Triplet Contrastive Representation Learning for Unsupervised Vehicle Re-Identification", journal = j-TOMM, volume = "21", number = "11", pages = "316:1--316:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3695255", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "316", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Qu:2025:SFB, author = "Shilin Qu and Weiqing Wang and Xin Zhou and Haolan Zhan and Zhuang Li and Lizhen Qu and Linhao Luo and Yuan-Fang Li and Gholamreza Haffari", title = "Scalable Frame-Based Construction of Sociocultural Norm Bases for Socially Aware Dialogues", journal = j-TOMM, volume = "21", number = "11", pages = "317:1--317:17", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3697838", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "317", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cui:2025:CAC, author = "Kai Cui and Shenghao Liu and Wei Feng and Xianjun Deng and Liangbin Gao and Minmin Cheng and Hongwei Lu and Laurence T. Yang", title = "Correlation-Aware Cross-Modal Attention Network for Fashion Compatibility Modeling in {UGC} Systems", journal = j-TOMM, volume = "21", number = "11", pages = "318:1--318:24", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3698772", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "318", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tao:2025:MCS, author = "Zhulin Tao and Runze Zhao and Xin Shi and Xingyu Gao and Xi Wang and Xianglin Huang", title = "Multimodal Consistency Suppression Factor for Fake News Detection", journal = j-TOMM, volume = "21", number = "11", pages = "319:1--319:19", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3699959", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "319", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:SCP, author = "Yue Zhang and Chao Wang and Fei Fang and Yunzhi Zhuge and Hehe Fan and Xiaojun Chang and Cheng Deng and Yi Yang", title = "{SAMControl}: Controlling Pose and Object for Image Editing with Soft Attention Mask", journal = j-TOMM, volume = "21", number = "11", pages = "320:1--320:28", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702999", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "320", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Sun:2025:CGC, author = "Zhengwentai Sun and Yanghong Zhou and P. Y. 
Mok", title = "{CoDE-GAN}: Content Decoupled and Enhanced {GAN} for Sketch-guided Flexible Fashion Editing", journal = j-TOMM, volume = "21", number = "11", pages = "321:1--321:24", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712063", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "321", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Shen:2025:STA, author = "Leqi Shen and Sicheng Zhao and Yifeng Zhang and Pengzhang Liu and Yongjun Bao and Guiguang Ding", title = "Spatio-Temporal Attention for Text-Video Retrieval", journal = j-TOMM, volume = "21", number = "11", pages = "322:1--322:20", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715137", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "322", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jing:2025:SSH, author = "Weipeng Jing and Peilun Kang and Donglin Di and Juntao Gu and Linhui Li and Mahmoud Emam and Linda Mohaisen and Xun Yang and Chao Li", title = "{SRF}: {SpectrumRecombineFormer} for Hyperspectral Image Classification", journal = j-TOMM, volume = "21", number = "11", pages = "323:1--323:25", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715698", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "323", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:SED, author = "Yiming Wu and Qihe Pan and Zhen Zhao and Zicheng Wang and Sifan Long and Ronghua Liang", title = "{SOEDiff}: Efficient Distillation for Small Object Editing", journal = j-TOMM, volume = "21", number = "11", pages = "324:1--324:19", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715915", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "324", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dai:2025:DMS, author = "Licun Dai and Zhiming Luo and Yongguo Ling and Jiaxing Chai and Shaozi Li", title = "Dual-Modality-Shared Learning and Label Refinement for Unsupervised Visible-Infrared Person {ReID}", journal = j-TOMM, volume = "21", number = "11", pages = "325:1--325:24", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3724397", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "325", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Faisal:2025:HNP, author = "Mohd Faisal and Roberto Alejandro {Martinez Velazquez} and Fedwa Laamarti and Hussein {Al Osman} and Abdulmotaleb {El Saddik}", title = "Haptic Network Protocols: a Comprehensive Review and Directions for Next-Gen Metaverse Applications", journal = j-TOMM, volume = "21", number = "11", pages = "326:1--326:33", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3759459", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "326", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:AIV, author = "Yahui Wang and Mohsen Guizani and M. 
Shamim Hossain", title = "Artificial Intelligence for Virtual Reality: State of the Art, Challenges, and Future Perspectives", journal = j-TOMM, volume = "21", number = "11", pages = "327:1--327:29", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769090", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "327", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhong:2025:ETA, author = "Yuanhong Zhong and Ge Yan and Ruyue Zhu and Ping Gan and Xuerui Shen", title = "Early Traffic Accident Anticipation via Feature Consistency Representation and Soft Label Regression", journal = j-TOMM, volume = "21", number = "11", pages = "328:1--328:21", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767737", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "328", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fu:2025:VTC, author = "Hao Fu and Fengyu Yang and Boyang Wang and Wei Ji and Hanbin Zhao and Chao Zhang and Roger Zimmermann and Hui Qian", title = "Visuo-Tactile Class-Incremental Learning", journal = j-TOMM, volume = "21", number = "11", pages = "329:1--329:19", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3754452", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "329", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Fang:2025:AAD, author = "Dikai Fang and Huahu Xu and Yuzhe Huang and Honghao Gao", title = "{ADTC}: Adaptive Dual-Stage Tree Construction for Point-Supervised Video Moment Retrieval", journal = j-TOMM, volume = "21", number = "11", pages = "330:1--330:27", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744651", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "330", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kang:2025:UEV, author = "Jiaxu Kang and Bolei Chen and Ping Zhong and Yifei Wang and Haonan Yang and Yu Sheng", title = "Unbiased Embodied Visual Representation Learning with Causal Inference and Cross-Modality Alignment", journal = j-TOMM, volume = "21", number = "11", pages = "331:1--331:23", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760261", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "331", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xue:2025:LLI, author = "Minglong Xue and Jinhong He and Wenhai Wang and Mingliang Zhou", title = "Low-Light Image Enhancement via {CLIP-Fourier} Guided Wavelet Diffusion", journal = j-TOMM, volume = "21", number = "11", pages = "332:1--332:22", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764933", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "332", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2025:IIW, author = "Jing Li and Jun Guo and M. 
Shamim Hossain and Ning Yu", title = "Immersive Ink-and-Wash Landscape Design in Multimedia for Art Therapy", journal = j-TOMM, volume = "21", number = "11", pages = "333:1--333:26", month = nov, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762998", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Tue Dec 23 07:13:19 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "333", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Hu:2025:HOM, author = "Menghao Hu and Yaguang Song and Xiaoshan Yang and Yaowei Wang and Changsheng Xu", title = "Health-oriented Multimodal Food Question Answering with Implicit and Explicit Knowledge", journal = j-TOMM, volume = "21", number = "12", pages = "334:1--334:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3766065", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Health-oriented food analysis has become a research hotspot in recent years because it can help people keep away from unhealthy diets. Remarkable advancements have been made in recipe retrieval, food recommendation, nutrition analysis, and calorie \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "334", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ren:2025:CGM, author = "Peng Ren and Xiaoheng Li and Yunfeng Bai and Jinyuan Jia", title = "Correlation-guided Masked Autoencoder with Multimodal Contrastive Interaction on Point Clouds", journal = j-TOMM, volume = "21", number = "12", pages = "335:1--335:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770579", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Self-supervised learning has shown remarkable effectiveness in 3D point cloud understanding. Existing masked autoencoders or contrastive learning paradigms can acquire robust feature representations from unlabeled data. Specifically, masked autoencoders \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "335", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wen:2025:LDS, author = "Yang Wen and Shunzhe Shen and Wuzhen Shi and Wenming Cao and Lei Bi and Xiaokang Yang and Bin Sheng", title = "A Lightweight Depthwise Separable {ConvNet} with Frequency-domain Enhancement for Retinal Vessel Segmentation", journal = j-TOMM, volume = "21", number = "12", pages = "336:1--336:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767732", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Automatic retinal vessel segmentation is crucial in the diagnosis and treatment of various cardiovascular and eye diseases. 
Although current vessel segmentation methods have achieved impressive performance, some challenging issues still need to be \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "336", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wen:2025:CNC, author = "Yang Wen and Xiang-Ning Wang and Jixue Tang and Ping Li and Lei Zhu and Jing Qin and Xiaokang Yang and Bin Sheng", title = "{CCM-Net}: Contrastive and Consistent Multi-Task Network for Artifact Segmentation and Quality Classification of {OCTA} Images", journal = j-TOMM, volume = "21", number = "12", pages = "337:1--337:21", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3765745", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Artifacts are prevalent in Optical Coherence Tomography Angiography (OCTA) images, which probably interfere doctor's diagnosis and greatly limit its utility. Therefore, it is desirable to segment artifacts and assess quality when using them for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "337", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Bie:2025:MSR, author = "Lin Bie and Siqi Li and Xiaopin Zhong and Zongze Wu and Yue Gao", title = "Multi-space Representation Fusion Enhanced Monocular Depth Estimation via Virtual Point Cloud", journal = j-TOMM, volume = "21", number = "12", pages = "338:1--338:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770076", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Monocular Depth Estimation (MDE) is a fundamental problem in computer vision with broad applications in various downstream tasks. While recent studies focus on designing increasingly complex and powerful deep learning methods to regress depth maps \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "338", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chang:2025:FGS, author = "Tang-Chen Chang and Duen-Chian Jheng and Hsuan-Ya Liang and Bill Louis Harchan and Pu Ching and Tsung-Hsun Tsai and Chih-Yi Chang and Te-Cheng Wu and Yung-Hui Li and Tse-Yu Pan and Hung-Kuo Chu and Min-Chun Hu", title = "Fine-grained Stroke Recognition in Broadcast Table Tennis Videos with {ATDT}", journal = j-TOMM, volume = "21", number = "12", pages = "339:1--339:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769299", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study introduces an automated system for fine-grained stroke recognition in broadcast table tennis videos, designed to address challenges in manual annotation and tactical analysis during international competitions. The proposed framework integrates \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "339", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wong:2025:ECS, author = "Yi Jie Wong and Mau-Luen Tham and Ban-Hoe Kwan and Yoong Choon Chang and Anissa Mokraoui and Feng Ke", title = "Efficient Client Selection for Asynchronous Federated Learning for Adaptive Bitrate Streaming", journal = j-TOMM, volume = "21", number = "12", pages = "340:1--340:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3765759", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, Deep Reinforcement Learning (DRL) has been applied to enhance the Quality of Experience (QoE) of Adaptive Bitrate Streaming (ABR) by adjusting the video quality level in real time based on instantaneous network conditions. To build a state-of-the-art \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "340", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:OHD, author = "Xingbo Liu and Zhijie Zhao and Xuening Zhang and Xiao Kang and Xiushan Nie", title = "Online Hashing with Discriminative Attribute Embedding", journal = j-TOMM, volume = "21", number = "12", pages = "341:1--341:20", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746641", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Online hashing has emerged as a powerful tool for efficiently processing large-scale and streaming data. 
However, existing approaches often struggle with scalability limitations in similarity relations and inadequate discrimination provided by one-hot \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "341", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yuan:2025:IPG, author = "Zihan Yuan and Li Li and Zichi Wang and Xinpeng Zhang", title = "Integrity Protection of Generative Adversarial Networks Using Fragile Watermarking", journal = j-TOMM, volume = "21", number = "12", pages = "342:1--342:21", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744566", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deep learning has made remarkable achievements in the field of artificial intelligence. However, a well-trained deep neural network is at risk of being tampered with. Although some model watermarking schemes have been proposed to solve this problem, most \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "342", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2025:PTE, author = "Chongwei Liu and Haojie Li and Zhihui Wang and Rui Xu", title = "Is a Pure Transformer Effective for Separated and Online Multi-Object Tracking?", journal = j-TOMM, volume = "21", number = "12", pages = "343:1--343:21", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3749105", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent advances in multi-object tracking (MOT) have demonstrated significant success in short-term association within the separated tracking-by-detection online paradigm. However, long-term tracking remains challenging. While graph-based approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "343", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:CCE, author = "Mingming Zhang and Qingjie Liu and Yunhong Wang", title = "{CtxMIM}: Context-Enhanced Masked Image Modeling for Remote Sensing Image Understanding", journal = j-TOMM, volume = "21", number = "12", pages = "344:1--344:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769084", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Learning representations through self-supervision on unlabeled data has proven highly effective for understanding diverse images. 
However, remote sensing images often have complex and densely populated scenes with multiple land objects and no clear \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "344", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Ma:2025:DBP, author = "Wenzhuo Ma and Zhenzhong Chen", title = "Diffusion-based Perceptual Neural Video Compression with Temporal Diffusion Information Reuse", journal = j-TOMM, volume = "21", number = "12", pages = "345:1--345:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3761815", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, foundational diffusion models have attracted considerable attention in image compression tasks, whereas their application to video compression remains largely unexplored. In this article, we introduce DiffVC, a diffusion-based perceptual neural \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "345", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Milidonis:2025:DCC, author = "Xenios Milidonis and Alessandro Artusi and Francesco Banterle", title = "Deep Chroma Compression of Tone-Mapped Images", journal = j-TOMM, volume = "21", number = "12", pages = "346:1--346:17", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744925", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Acquisition of High Dynamic Range (HDR) images is thriving due to the increasing use of smart devices and the demand for high-quality output. Extensive research has focused on developing methods for reducing the luminance range in HDR images using \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "346", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wu:2025:BFB, author = "Jiesheng Wu and Fangwei Hao and Jing Xu", title = "Boosting Foreground-Background Disentanglement for Camouflaged Object Detection", journal = j-TOMM, volume = "21", number = "12", pages = "347:1--347:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3768584", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In nature, certain objects exhibit patterns that closely resemble their backgrounds, a phenomenon commonly referred to as Camouflaged Object Detection (COD). 
We argue that existing COD approaches often suffer from insufficient discriminability for these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "347", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:SCW, author = "Jinming Cao and Sicheng Shen and Qiu Zhou and Yifang Yin and Yangyan Li and Roger Zimmermann", title = "{ShapeMoir{\'e}}: Channel-Wise Shape-Guided Network for Image Demoir{\'e}ing", journal = j-TOMM, volume = "21", number = "12", pages = "348:1--348:20", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748657", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Photographing optoelectronic displays often introduces unwanted moir{\'e} patterns due to analog signal interference between the pixel grids of the display and the camera sensor arrays. This work identifies two problems that are largely ignored by existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "348", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2025:ALS, author = "Xinran Wang and Zhiqiang Tian and Lin Bie and Siqi Li and Dejian Guo and Shaoyi Du and Yue Gao", title = "Arbitrary Large-Scale Scene Reconstruction without Annotated Block Partitions", journal = j-TOMM, volume = "21", number = "12", pages = "349:1--349:19", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3768159", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Large-scale scene reconstruction is a challenging problem. As different parts of the scene could be visible from different collected image frames, previous works manually use distance or geography to decompose the scene into parts and reconstruct each \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "349", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:VVR, author = "Zijie Zhou and Mingliang Zhou and Jun Luo and Huayan Pu and Leong Hou U. and Xuekai Wei and Weijia Jia", title = "{VideoGNN}: Video Representation Learning via Dynamic Graph Modelling", journal = j-TOMM, volume = "21", number = "12", pages = "350:1--350:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760532", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Graphs offer a flexible structure for vision tasks, with CNNs and Transformers conditioned as two specific cases of graph structures. 
In CNNs, the input images are treated as graphs where only neighboring patches are connected, whereas Transformers view \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "350", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Huang:2025:CMR, author = "Shijuan Huang and Zongyi Li and Hefei Ling and Jianbo Li", title = "Cross-Modality Relation and Uncertainty Exploration for Text-Based Person Search", journal = j-TOMM, volume = "21", number = "12", pages = "351:1--351:20", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747185", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Text-based person search aims to retrieve specific individuals from an extensive image gallery using textual queries. Recent approaches have delved into aligning global and part features in both text and image modalities, yielding substantial \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "351", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhou:2025:MSH, author = "Huiyi Zhou and Feng Zhao and Chunhai Li", title = "Multi-scale Historical Trajectory Decomposition for Viewport Prediction in 360-degree Videos", journal = j-TOMM, volume = "21", number = "12", pages = "352:1--352:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760533", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "360-degree panoramic video provides users with an unprecedented immersive experience and is rapidly evolving with the support of virtualized devices. Effective viewport prediction is crucial for alleviating high-speed bandwidth constraints and enhancing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "352", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Yang:2025:RHR, author = "Chengmei Yang and Qian Li and Zhenyang Li and Chen Ma and Lianghua He", title = "{R-HMF}: A Relation-enhanced Hierarchical Multimodal Framework for Few-shot Knowledge Graph Completion", journal = j-TOMM, volume = "21", number = "12", pages = "353:1--353:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769865", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Knowledge graph completion (KGC), which aims at inferring the missing fact triples, has shown an essential role in constructing a complete knowledge graph to enhance downstream applications. 
However, most KGC techniques require a large number of labeled \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "353", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Cao:2025:UFI, author = "Xueyan Cao and Tao Lin and Liping Zhao and Wei Han and Shanshe Wang and Kailun Zhou and Yufen Yang", title = "Ultra-Fast Intra Screen Content Coding via Accelerated Re-Visit {CU}-Coding in {AVS3}", journal = j-TOMM, volume = "21", number = "12", pages = "354:1--354:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3748509", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Screen Content Coding (SCC) is an indispensable tool for enabling distributed collaboration, such as video conferencing. Encoders in the latest video coding standards, particularly for SCC scenarios, employ a wider variety of partitioning tree splitting \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "354", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2025:DER, author = "Ziyang Song and Ruijie Zhu and Jing Wang and Chuxin Wang and Jianfeng He and Jiacheng Deng and Wenfei Yang and Tianzhu Zhang", title = "{ER-Depth}: Enhancing the Robustness of Self-Supervised Monocular Depth Estimation in Challenging Scenes", journal = j-TOMM, volume = "21", number = "12", pages = "355:1--355:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750050", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Self-supervised monocular depth estimation holds significant importance in the fields of autonomous driving and robotics. However, existing methods are typically trained and evaluated on clear, sunny datasets, overlooking the impact of various adverse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "355", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Song:2025:HIN, author = "Chuanle Song and Wei Zhou and Han Jiao and Wenjin Huang and Junfeng Li and Yihua Huang", title = "{HIN}: Hierarchical Interaction Network for Image Captioning", journal = j-TOMM, volume = "21", number = "12", pages = "356:1--356:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769866", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The purpose of the image captioning task is to understand the content of an image and generate corresponding descriptive text. 
Traditional approaches to image captioning typically generate descriptive text by extracting different types of visual features \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "356", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{He:2025:UVI, author = "Jiakai He and Yiming Yang and Haifeng Hu and Ruixing Wu", title = "Unsupervised Visible-Infrared Person {ReID} via Modality-Camera Balance Label Refinement", journal = j-TOMM, volume = "21", number = "12", pages = "357:1--357:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3772086", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Unsupervised Learning Visible-Infrared Person Re-Identification (USL-VI-ReID) focuses on developing a cross-modality retrieval model without the need for labels, minimizing the dependence on costly manual annotation across modalities. Recently, various \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "357", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhu:2025:QBK, author = "Xuelin Zhu and Jian Liu and Dongqi Tang and Jiawei Ge and Weijia Liu and Bo Liu and Jiuxin Cao", title = "Query-Based Knowledge Sharing for Open-Vocabulary Multi-Label Classification", journal = j-TOMM, volume = "21", number = "12", pages = "358:1--358:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762195", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Identifying labels that are unseen during training, known as multi-label zero-shot learning, is a non-trivial task in computer vision. Recent studies have increasingly focused on utilizing vision-language pre-training (VLP) models to recognize unseen \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "358", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2025:FRM, author = "Weicheng Xie and Haijian Liang and Zenghao Niu and Xianxu Hou and Siyang Song and Zitong Yu and Linlin Shen", title = "Frequency Restoration and Modality Enforcement towards Resisting-corruption Multimodal Sentiment Analysis", journal = j-TOMM, volume = "21", number = "12", pages = "359:1--359:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767746", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "For Multimodal Sentiment Analysis (MSA), previous methods concentrate on designing sophisticated fusion strategies and performing representation learning across heterogeneous modalities, aiming to leverage multimodal signals to detect human sentiment. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "359", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Kombou:2025:PUA, author = "Victor Kombou and Qi Xia and Hu Xia and Jianbin Gao and Wei Zhang and Benjamin Fabien Eyezo'o and Stephane Richard Befoum and Jonathan Anto Leoba and Brinda Leaticia Kuiche Sop", title = "{PrivaMod}: Uncertainty-Aware Multimedia Fusion with Privacy Guarantees for {NFT} Visual and Transaction Analysis", journal = j-TOMM, volume = "21", number = "12", pages = "360:1--360:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762999", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Non-fungible token (NFT) markets present a dual analytical challenge: integrating heterogeneous data modalities (high-dimensional visual features and discrete transaction sequences) while preserving privacy for sensitive wallet addresses and trading \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "360", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tu:2025:UGH, author = "Junfeng Tu and Xueliang Liu and Yanbin Hao and Richang Hong", title = "A Unified Generative Hashing for Cross-Modal Retrieval", journal = j-TOMM, volume = "21", number = "12", pages = "361:1--361:15", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744567", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Cross-modal hashing is a highly effective and efficient method for information retrieval, enabling the search for correlated data across different modality databases using compact hash codes. Conventional cross-modal hashing typically uses separate model \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "361", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Jian:2025:DIL, author = "Meng Jian and Ruoxi Li and Xiaoyan Gao and Liqiang Wei and Lifang Wu", title = "Dual Interest Learning with Context-Aware Adaptive Interaction for Social Recommendation", journal = j-TOMM, volume = "21", number = "12", pages = "362:1--362:18", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767747", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Social recommendation utilizes social relations to extract auxiliary collaborative signals, effectively mitigating data sparsity issues. 
However, existing approaches predominantly focus on static influence from social friends while neglecting two \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "362", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2025:EMS, author = "Kaiwei Zhang and Mohan He and Dandan Zhu and Kun Zhu and Xiongkuo Min and Guangtao Zhai", title = "Elevating Mesh Saliency in {VR}: Introducing a Novel Prediction Network and Dataset", journal = j-TOMM, volume = "21", number = "12", pages = "363:1--363:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3761816", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:28 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In computer graphics, polygon meshes stand out as a popular representation providing effective delineation of delicate textures and complex geometries. When dealing with geometric processing tasks for critical regions of the mesh, it is necessary to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "363", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wei:2026:SID, author = "Mingqiang Wei and Qian Sun and Haoran Xie and Dong Liang and Dingkun Zhu and Fu Lee Wang", title = "Search by Image: Deeply Exploring Beneficial Features for Beauty Product Retrieval", journal = j-TOMM, volume = "22", number = "1", pages = "1:1--1:19", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3773765", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Searching by image is popular yet still challenging in e-commerce due to the extensive interference arising from (i) data variations (e.g., background, pose, visual angle, brightness) of real-world captured images and (ii) similar images in the query \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "1", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiong:2026:EPE, author = "Hu Xiong and Hang Yan and Mohammad S. 
Obaidat and Jingxue Chen and Mingsheng Cao and Sachin Kumar and Kadambri Agarwal and Saru Kumari", title = "Efficient and Privacy-Enhanced Asynchronous Federated Learning for Multimedia Data in Edge-Based {IoT}", journal = j-TOMM, volume = "22", number = "1", pages = "2:1--2:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3688002", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid development of smart device technology, the current version of the Internet of Things (IoT) is moving towards a multimedia IoT because of multimedia data. This innovative concept seamlessly integrates multimedia data with the IoT-Edge \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "2", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Siddiqi:2026:SVD, author = "Sadia Jabeen Siddiqi and Abdulraheem H. Alobaidi and Mian Ahmad Jan and Muhammad Tariq", title = "Securing Vehicle-to-Digital Twin Communications in the {Internet of Vehicles}", journal = j-TOMM, volume = "22", number = "1", pages = "3:1--3:19", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3711863", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The current landscape of data-centric Internet of Vehicles (IoVs) encompasses a fusion of Human-driven Vehicles, Autonomous Vehicles, Road-Side Units, and edge-based devices engaged in periodic communication. Given the stringent latency requirements \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "3", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2026:FPM, author = "Baoping Liu and Bo Liu and Ming Ding and Tianqing Zhu", title = "{ForgeFinder}: Perceptive Multimodal Deepfake Detection via Multi-grained Forgery Localization", journal = j-TOMM, volume = "22", number = "1", pages = "4:1--4:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778030", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Deepfake techniques can now generate multimodal content comprising video and audio tracks. Compared with unimodal Deepfake images, videos or audio, multimodal Deepfake content is more deceptive and easily leads to the dissemination of hate speech, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "4", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xiang:2026:FLB, author = "Haolong Xiang and Xuyun Zhang and Xiaolong Xu and Amin Beheshti and Lianyong Qi and Yujie Hong and Wanchun Dou", title = "Federated Learning-Based Anomaly Detection with Isolation Forest in the {IoT-Edge} Continuum", journal = j-TOMM, volume = "22", number = "1", pages = "5:1--5:19", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3702995", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Traditional methods for ensuring security and privacy face challenges in safeguarding multimedia data within the IoT-edge continuum, as their significant computational demands render them unsuitable for IoT devices with limited resources. Next, we find \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "5", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Nguyen:2026:HGL, author = "Trung Thanh Nguyen and Yasutomo Kawanishi and Takahiro Komamizu and Ichiro Ide", title = "Hierarchical Global-Local Fusion for One-stage Open-vocabulary Temporal Action Detection", journal = j-TOMM, volume = "22", number = "1", pages = "6:1--6:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3773986", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Open-vocabulary Temporal Action Detection (Open-vocab TAD) extends the detection scope of Closed-vocabulary Temporal Action Detection (Closed-vocab TAD) to unseen action classes specified by vocabularies not included in the training data, within \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "6", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2026:CCP, author = "Yuan Wang and Bin Zhu and Yanbin Hao and Chong-Wah Ngo and Yi Tan and Xiang Wang", title = "{CookingDiffusion}: Cooking Procedural Image Generation with Stable Diffusion", journal = j-TOMM, volume = "22", number = "1", pages = "7:1--7:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3771995", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recent advancements in text-to-image generation models have excelled in creating diverse and realistic images. 
This success extends to food imagery, where various conditional inputs like cooking styles, ingredients, and recipes are utilized. However, a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "7", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2026:LFR, author = "Yaning Li and Hao Zhu and Bing-Kun Bao", title = "Light Field Reconstruction Using Multi-orientation Epipolar Plane Images", journal = j-TOMM, volume = "22", number = "1", pages = "8:1--8:22", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777898", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Light field reconstruction is one of the most important techniques for future glass-free 3D media production. However, current techniques suffer from low view-consistency and fixed patterns of view-trajectory. This article presents the 3D Multi- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "8", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2026:HBC, author = "Qing Zhang and Jing Zhang and Xiangdong Su and Feilong Bao and Guanglai Gao", title = "Hyperbolic-Based Cross-Modal Semantic Remodeling Network for Zero-Shot Sketch-Based Image Retrieval", journal = j-TOMM, volume = "22", number = "1", pages = "9:1--9:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777371", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The Zero-Shot Sketch-Based Image Retrieval (ZS-SBIR) task aims to retrieve images associated with sketches from unseen classes, bringing great convenience to the engineering field. To address the modality gap, most existing works project images and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "9", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Tang:2026:HSF, author = "Anni Tang and Zhiyu Zhang and Chen Zhu and Jun Ling and Rong Xie and Li Song", title = "A Hybrid Scheme for Face Video Compression", journal = j-TOMM, volume = "22", number = "1", pages = "10:1--10:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3783982", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "With the rapid development of social media, the amount of face video data has grown rapidly, making face video compression a hot research topic. Traditional video coding techniques do not discriminate video content and compress all videos in the same way, 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "10", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Liu:2026:CCM, author = "Xingyu Liu and Yan Jiang and Xu Cheng and Hao Yu and Haoyu Chen and Guoying Zhao", title = "{CROMBO}: Cross-Modality Bootstrapping for Unified Sketch-Photo Representation Learning", journal = j-TOMM, volume = "22", number = "1", pages = "11:1--11:18", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778043", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sketch-photo recognition refers to matching hand-drawn sketches with their corresponding photos, where the performance essentially depends on how well the representations of the two modalities are aligned in the feature spaces. Existing works bluntly \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "11", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2026:QEV, author = "Yan Zhang and Rui Song and Riting Xia and Zhenwei Shi", title = "{QoE} Evaluation for {VR} with Vibrotactile Feedback Based on Inter-user Brain Spatial Information", journal = j-TOMM, volume = "22", number = "1", pages = "12:1--12:20", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777459", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Subjective measurement remains one of the most widely used approaches for evaluating Quality of Experience (QoE) in tactile virtual environments. However, its reliability is often compromised by factors such as conscious bias, variations in user \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "12", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lei:2026:DAT, author = "Jianjun Lei and Duohui Tu and Bo Peng and Jie Zhu and Zhe Zhang and Chong Wu and Qingming Huang", title = "Depth-Aware Transformer for Aerial Localization", journal = j-TOMM, volume = "22", number = "1", pages = "13:1--13:16", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3773767", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, deep learning-based visual localization has gained significant attention and made remarkable advancements. 
Although previous visual localization methods have obtained promising performance on indoor or outdoor street scenes, there have been few \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "13", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lee:2026:RWN, author = "Seung-Lee Lee and Minjae Kang and Bo Seok Shim and Jong-Uk Hou", title = "Robust {3D} Watermarking for {NeRF}-Induced Modality Shifts", journal = j-TOMM, volume = "22", number = "1", pages = "14:1--14:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3774650", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "This study systematically addresses the issues of copyright infringement that have emerged with the introduction of neural radiance fields (NeRFs) by defining scenarios and applying and analyzing existing protective technologies tailored to each case. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "14", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Gao:2026:CHA, author = "Zan Gao and Xiaoyi Xu and Yibo Zhao and Chunjie Ma and Yanbing Xue and Riwei Wang", title = "A Collaborative Hierarchical Aggregation Network for Weakly Supervised Temporal Action Localization", journal = j-TOMM, volume = "22", number = "1", pages = "15:1--15:18", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778170", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Temporal action localization is a fundamental task in video understanding that focuses on classifying and temporally localizing action instances in untrimmed videos. Compared to temporal action localization, the Weakly supervised Temporal Action \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "15", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lin:2026:MEW, author = "Xianxuan Lin and Bailin Yang and Zhigeng Pan and Chuangxin Cai and Shuang Wang and Aditi Bhattarai and Fan Meng", title = "{MambaWDC}: Efficient Weather Data Compression via Selective State Space Model", journal = j-TOMM, volume = "22", number = "1", pages = "16:1--16:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778033", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The storage and transmission of large-scale meteorological data have become a significant bottleneck hindering the in-depth development of meteorological research. 
This study proposes a novel meteorological data compression framework characterized by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "16", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhang:2026:BIV, author = "Zeyang Zhang and Hui Li and Tianyang Xu and Xiaojun Wu and Congcong Bian and Josef Kittler", title = "{BusReF}: Infrared-Visible Images Registration and Fusion Focus on Reconstructible Area Using One Set of Features", journal = j-TOMM, volume = "22", number = "1", pages = "17:1--17:19", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3773769", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "In multi-modal imaging scenarios, the misalignment of images presents a persistent challenge. Conventional image fusion algorithms, aiming to enhance the performance of downstream vision tasks, presuppose strictly registered inputs to achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "17", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Lee:2026:DLH, author = "Jooyoung Lee and Se Yoon Jeong and Munchurl Kim", title = "{DeepHQ}: Learned Hierarchical Quantizer for Progressive Deep Image Coding", journal = j-TOMM, volume = "22", number = "1", pages = "18:1--18:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3773994", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Research on entropy model-based Learned Image Compression (LIC) has been actively progressing, leading to rapid advancements in coding efficiency. Beyond improvements in coding efficiency, LIC methods have also been explored for practical codec \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "18", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Singh:2026:SCT, author = "Karanvir Singh and Abdulmotaleb {El Saddik} and Mukesh Saini", title = "A Step Closer Towards the Digital Twin of the Plant", journal = j-TOMM, volume = "22", number = "1", pages = "19:1--19:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3774885", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Digital twins can provide vital insights into agricultural products and processes. There have been a lot of documented attempts at digital twins in agriculture. However, majority of these attempts build synthetic models and ignore the temporal dimension \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Multimed Comput. Commun. Appl.", articleno = "19", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Deng:2026:NNN, author = "Haiyu Deng and Xu Wang and Guangsheng Yu and Wei Ni and Ying He and Tanzeela Altaf and Ren Ping Liu", title = "{NNFMAC}: a Neural Network Fingerprinting-Based Model Authentication Code Scheme", journal = j-TOMM, volume = "22", number = "1", pages = "20:1--20:25", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778121", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "As deep learning-based AI proliferates, model theft and plagiarism pose increasing Intellectual Property (IP) risks. However, watermarking alters model weights and can degrade performance, while fingerprinting often merely verifies uniqueness or requires \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "20", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Dong:2026:EPG, author = "Xin Dong and Lihan Zhang and Aoyang Liu and Xiaojun Liang and Yutao Guo and Yansong Tang", title = "Enhancing Pose-Guided Human Image Generation with Comprehensive and Adjustable {3D} Control", journal = j-TOMM, volume = "22", number = "1", pages = "21:1--21:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778044", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Pose-guided human image generation aims to render a source image in a specific pose. 
Current methods predominantly employ 2D-based signals, which exhibit inherent information deficits, as pose conditions. This leads to difficulty in establishing precise \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "21", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xia:2026:SSA, author = "Yibo Xia and Qihui Zhan and Xiaoyan Luo and Xiaofeng Shi and Yunhong Wang", title = "{SignMask}: Structure-aware Masked Modeling for Holistic {3D} Sign Language Production", journal = j-TOMM, volume = "22", number = "1", pages = "22:1--22:28", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3776750", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Sign Language Production (SLP) aims to translate spoken textual languages into sign language sequences, which can significantly bridge the communication gap for deaf and hard-of-hearing individuals. Most previous SLP methods typically rely on skeleton- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "22", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zhao:2026:MCE, author = "Liangcheng Zhao and Yueying Wang and Yuhao Qing and Dan Zeng and Li Xu", title = "{MCFINet}: a Cost-Efficient Multi-Channel Feature Integration Network for Surface Scenarios Image Super-Resolution", journal = j-TOMM, volume = "22", number = "1", pages = "23:1--23:17", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777465", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Convolutional Neural Network (CNN) and Vision Transformer (ViT) have revolutionized the field of image super-resolution (SR). However, their complexity poses challenges for resource-constrained scenarios, particularly due to the high computational \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "23", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Wang:2026:EIC, author = "Zhihao Wang and Feifei Zhang and Lingkai Ran and Caixia Song and Ling Zhou", title = "Enhancing Image Captioning through Bridging Image-Text Gap and Reducing Hallucinations", journal = j-TOMM, volume = "22", number = "1", pages = "24:1--24:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3776746", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "While autoregressive models have achieved remarkable success in image captioning, their slow inference speed limits their applicability in real-time scenarios. 
Non-autoregressive methods provide a promising alternative for faster caption generation; \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "24", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xu:2026:FRU, author = "Ruiji Xu and Junhao Chen and Runzhe Zhang and Guanglin Dai and Keji Mao", title = "{FaceDepth}: a Robust Unimodal Depression Detection Framework Using Invariant Facial Landmark Features", journal = j-TOMM, volume = "22", number = "1", pages = "25:1--25:27", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777463", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Although significant progress has been made in automatic diagnosis systems for depression, most of the work focuses on combining features from multiple modalities to improve classification accuracy, which generates a lot of space-time overhead and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "25", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Li:2026:MSS, author = "Yixuan Li and Lipeng Ma and Weidong Yang and Ben Fei", title = "{3DMambaComplete}: Structured State Space Model for High-Efficiency Point Cloud Completion", journal = j-TOMM, volume = "22", number = "1", pages = "26:1--26:24", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3774887", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Point cloud completion seeks to reconstruct a complete and high-fidelity point cloud from an incomplete and low-quality input. Current methods predominantly rely on Transformer architectures for feature extraction. However, these approaches face two \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "26", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2026:MEL, author = "Shangheng Chen and Quan Fang and Shengsheng Qian and Changsheng Xu", title = "Metapath-Enhanced Language Model Pretraining on Text-Attributed Heterogeneous Graphs", journal = j-TOMM, volume = "22", number = "1", pages = "27:1--27:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3763241", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Text-Attributed Heterogeneous Graphs (TAHGs), which combine text data with various graph relationship information linked to rich semantic entities, are ubiquitous in real-world scenarios. 
To extract information from TAHGs, a commonly used method is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "27", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Zheng:2026:CLA, author = "Yuanyu Zheng and Lin Zhang and Yunda Sun and Ying Shen and Shengjie Zhao", title = "{CaneSpeaker}: an {LLM}-Assisted Speaker for Generating Human-Like Navigation Instructions", journal = j-TOMM, volume = "22", number = "1", pages = "28:1--28:26", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3785009", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Navigation instruction generation aims to address data scarcity in Vision-and-Language Navigation (VLN) by generating navigation instructions for unannotated routes from data sources like simulators or online data. However, existing methods usually \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. 
Appl.", articleno = "28", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Xie:2026:MHP, author = "Wenjun Xie and Kejun Chen and Dong Wang and Xiaoping Liu", title = "{MatPose}: a {2D} Human Pose Estimation Model with Hybrid Mamba-Transformer", journal = j-TOMM, volume = "22", number = "1", pages = "29:1--29:21", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777469", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "Recently, Mamba has gained widespread attention due to its ability to model long-range dependencies with linear computational complexity. To explore the application of Mamba in 2D human pose estimation, we propose MatPose, a Mamba-Transformer hybrid \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "29", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } @Article{Chen:2026:PVC, author = "Xinyi Chen and Weimin Lei and Wei Zhang and Wenhui Ye and Yanwen Wang", title = "Portrait Video Compression with Semantic-guided Animation Model and Background Incremental Coding", journal = j-TOMM, volume = "22", number = "1", pages = "30:1--30:23", month = jan, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3772085", ISSN = "1551-6857 (print), 1551-6865 (electronic)", ISSN-L = "1551-6857", bibdate = "Mon Feb 2 08:34:31 MST 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib", abstract = "The application of animation models in facial video compression has yielded significant coding gains, particularly at ultra-low bitrates. 
Despite notable advancements, research on portrait video scenes, especially in half-body and full-body contexts, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Multimed Comput. Commun. Appl.", articleno = "30", fjournal = "ACM Transactions on Multimedia Computing, Communications, and Applications", journal-URL = "https://dl.acm.org/loi/tomm", } %%% [17-Apr-2021] TO DO: where are articles 104 to 116 from volume 16? %%% There is a gap between issues 3s and 4. I reported the problem %%% to ACM on 24 March 2022. The problem persists on [11-Dec-2024].