%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.42", %%% date = "08 October 2025", %%% time = "06:44:25 MDT", %%% filename = "jdiq.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "33139 11948 59934 561755", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "Journal of Data and Information Quality %%% (JDIQ); bibliography", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% the ACM Journal of Data and Information %%% Quality (JDIQ) (CODEN ????, ISSN 1936-1955), %%% covering all journal issues from 2009 -- %%% date. %%% %%% At version 1.42, the COMPLETE journal %%% coverage looked like this: %%% %%% 2009 ( 17) 2015 ( 22) 2021 ( 23) %%% 2010 ( 6) 2016 ( 14) 2022 ( 29) %%% 2011 ( 8) 2017 ( 17) 2023 ( 45) %%% 2012 ( 15) 2018 ( 34) 2024 ( 25) %%% 2013 ( 8) 2019 ( 21) 2025 ( 19) %%% 2014 ( 11) 2020 ( 22) %%% %%% Article: 336 %%% %%% Total entries: 336 %%% %%% The journal table of contents pages are at: %%% %%% http://www.acm.org/jdiq/ %%% http://portal.acm.org/browse_dl.cfm?idx=J1191 %%% https://dl.acm.org/loi/jdiq %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. 
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters. This is produced by Robert
%%%                        Solovay's checksum utility."
%%% }
%%% ====================================================================
%%% TeX preamble: reads bibnames.sty and defines \TM as a superscript
%%% ``TM'' set with the \sc font switch.
@Preamble{
    "\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% (ack-nhfb is the value of the acknowledgement field in the entries
%%% below.)
@String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% (j-JDIQ is the value of the journal field in the entries below.)
@String{j-JDIQ = "Journal of Data and Information Quality (JDIQ)"}

%%% ====================================================================
%%% Bibliography entries:

%%% JDIQ 1(1), June 2009, article 1.
@Article{Madnick:2009:EII,
  author =       "Stuart E. Madnick and Yang W.
Lee",
  title =        "Editorial for the Inaugural Issue of the {ACM Journal of Data and Information Quality (JDIQ)}",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(1), June 2009, article 2.
@Article{Madnick:2009:OFD,
  author =       "Stuart E. Madnick and Richard Y. Wang and Yang W. Lee and Hongwei Zhu",
  title =        "Overview and Framework for Data and Information Quality Research",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1516680",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Awareness of data and information quality issues has grown rapidly in light of the critical role played by the quality of information in our data-intensive, knowledge-based economy. Research in the past two decades has produced a large body of data quality knowledge and has expanded our ability to solve many data and information quality problems. In this article, we present an overview of the evolution and current landscape of data and information quality research. We introduce a framework to characterize the research along two dimensions: topics and methods. Representative papers are cited for purposes of illustrating the issues addressed and the methods used. We also identify and discuss challenges to be addressed in future research.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf.
Qual.",
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(1), June 2009, article 3.
@Article{Li:2009:BAE,
  author =       "Xiao-Bai Li",
  title =        "A {Bayesian} Approach for Estimating and Replacing Missing Categorical Data",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515695",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We propose a new approach for estimating and replacing missing categorical data. With this approach, the posterior probabilities of a missing attribute value belonging to a certain category are estimated using the simple Bayes method. Two alternative methods for replacing the missing value are proposed: The first replaces the missing value with the value having the estimated maximum probability; the second uses a value that is selected with probability proportional to the estimated posterior distribution. The effectiveness of the proposed approach is evaluated based on some important data quality measures for data warehousing and data mining. The results of the experimental study demonstrate the effectiveness of the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf.
Qual.",
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(1), June 2009, article 4.
@Article{Weber:2009:OSD,
  author =       "Kristin Weber and Boris Otto and Hubert {\"O}sterle",
  title =        "One Size Does Not Fit All---{A} Contingency Approach to Data Governance",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515696",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Enterprizes need Data Quality Management (DQM) to respond to strategic and operational challenges demanding high-quality corporate data. Hitherto, companies have mostly assigned accountabilities for DQM to Information Technology (IT) departments. They have thereby neglected the organizational issues critical to successful DQM. With data governance, however, companies may implement corporate-wide accountabilities for DQM that encompass professionals from business and IT departments. This research aims at starting a scientific discussion on data governance by transferring concepts from IT governance and organizational theory to the previously largely ignored field of data governance. The article presents the first results of a community action research project on data governance comprising six international companies from various industries. It outlines a data governance model that consists of three components (data quality roles, decision areas, and responsibilities), which together form a responsibility assignment matrix. The data governance model documents data quality roles and their type of interaction with DQM activities.
In addition, the article describes a data governance contingency model and demonstrates the influence of performance strategy, diversification breadth, organization structure, competitive strategy, degree of process harmonization, degree of market regulation, and decision-making style on data governance. Based on these findings, companies can structure their specific data governance model.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(1), June 2009, article 5.
@Article{Heinrich:2009:PDM,
  author =       "B. Heinrich and M. Klier and M. Kaiser",
  title =        "A Procedure to Develop Metrics for Currency and its Application in {CRM}",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515697",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Due to the importance of using up-to-date data in information systems, this article analyzes how the data-quality dimension currency can be quantified. Based on several requirements (e.g., normalization and interpretability) and a literature review, we design a procedure to develop probability-based metrics for currency which can be adjusted to the specific characteristics of data attribute values. We evaluate the presented procedure with regard to the requirements and illustrate the applicability as well as its practical benefit. In cooperation with a major German mobile services provider, the procedure was applied in the field of campaign management in order to improve both success rates and profits.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf.
Qual.",
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(2), September 2009, article 6.
@Article{Madnick:2009:ELS,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editorial Letter for the Special Issue on Data Quality in Databases and Information Systems",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577841",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(2), September 2009, article 7.
@Article{Naumann:2009:GES,
  author =       "Felix Naumann and Louiqa Raschid",
  title =        "Guest Editorial for the Special Issue on Data Quality in Databases",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577842",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf.
Qual.",
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(2), September 2009, article 8.
@Article{Dash:2009:MLN,
  author =       "Manoranjan Dash and Ayush Singhania",
  title =        "Mining in Large Noisy Domains",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577843",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In this article we address the issue of how to mine efficiently in large and noisy data. We propose an efficient sampling algorithm ({\em Concise\/}) as a solution for large and noisy data. Concise is far more superior than the Simple Random Sampling ({\em SRS\/}) in selecting a representative sample. Particularly when the data is very large and noisy, Concise achieves the maximum gain over SRS. The comparison is in terms of their impact on subsequent data mining tasks, specifically, classification, clustering, and association rule mining. We compared Concise with a few existing noise removal algorithms followed by SRS. Although the accuracy of mining results are similar, Concise spends very little time compared to the existing algorithms because Concise has linear time complexity.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "association rule mining; classification; clustering; data mining; Information filtering; sampling; selection process",
}

%%% JDIQ 1(2), September 2009, article 9.
@Article{Moustakides:2009:OSR,
  author =       "George V. Moustakides and Vassilios S.
Verykios",
  title =        "Optimal Stopping: a Record-Linkage Approach",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577844",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Record-linkage is the process of identifying whether two separate records refer to the same real-world entity when some elements of the record's identifying information (attributes) agree and others disagree. Existing record-linkage decision methodologies use the outcomes from the comparisons of the whole set of attributes. Here, we propose an alternative scheme that assesses the attributes sequentially, allowing for a decision to be made at any attribute's comparison stage, and thus before exhausting all available attributes. The scheme we develop is optimum in that it minimizes a well-defined average cost criterion while the corresponding optimum solution can be easily mapped into a decision tree to facilitate the record-linkage decision process. Experimental results performed in real datasets indicate the superiority of our methodology compared to existing approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "duplicate detection; optimal stopping; Record-linkage",
}

%%% JDIQ 1(2), September 2009, article 10.
@Article{Klein:2009:RDQ,
  author =       "A. Klein and W.
Lehner",
  title =        "Representing Data Quality in Sensor Data Streaming Environments",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577845",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Sensors in smart-item environments capture data about product conditions and usage to support business decisions as well as production automation processes. A challenging issue in this application area is the restricted quality of sensor data due to limited sensor precision and sensor failures. Moreover, data stream processing to meet resource constraints in streaming environments introduces additional noise and decreases the data quality. In order to avoid wrong business decisions due to dirty data, quality characteristics have to be captured, processed, and provided to the respective business task. However, the issue of how to efficiently provide applications with information about data quality is still an open research problem.\par In this article, we address this problem by presenting a flexible model for the propagation and processing of data quality. The comprehensive analysis of common data stream processing operators and their impact on data quality allows a fruitful data evaluation and diminishes incorrect business decisions. Further, we propose the data quality model control to adapt the data quality granularity to the data stream interestingness.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "data quality; Data stream processing; smart items",
}

%%% JDIQ 1(2), September 2009, article 11.
@Article{Embury:2009:IDS,
  author =       "Suzanne M. Embury and Paolo Missier and Sandra Sampaio and R.
Mark Greenwood and Alun D. Preece",
  title =        "Incorporating Domain-Specific Information Quality Constraints into Database Queries",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577846",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The range of information now available in queryable repositories opens up a host of possibilities for new and valuable forms of data analysis. Database query languages such as SQL and XQuery offer a concise and high-level means by which such analyses can be implemented, facilitating the extraction of relevant data subsets into either generic or bespoke data analysis environments. Unfortunately, the quality of data in these repositories is often highly variable. The data is still useful, but only if the consumer is aware of the data quality problems and can work around them. Standard query languages offer little support for this aspect of data management. In principle, however, it should be possible to embed constraints describing the consumer's data quality requirements into the query directly, so that the query evaluator can take over responsibility for enforcing them during query processing.\par Most previous attempts to incorporate information quality constraints into database queries have been based around a small number of highly generic quality measures, which are defined and computed by the information provider. This is a useful approach in some application areas but, in practice, quality criteria are more commonly determined by the user of the information not by the provider. In this article, we explore an approach to incorporating quality constraints into database queries where the definition of quality is set by the user and not the provider of the information.
Our approach is based around the concept of a {\em quality view}, a configurable quality assessment component into which domain-specific notions of quality can be embedded. We examine how quality views can be incorporated into XQuery, and draw from this the language features that are required in general to embed quality views into any query language. We also propose some syntactic sugar on top of XQuery to simplify the process of querying with quality constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "database query languages; Information quality; views; XQuery",
}

%%% JDIQ 1(2), September 2009, article 12.
@Article{Madnick:2009:CPS,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Call for Papers Special Issue on Healthcare Information Quality: the Challenges and Opportunities in Healthcare Systems and Services",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577847",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(3), December 2009, article 13.
@Article{Madnick:2009:ECW,
  author =       "Stuart E. Madnick and Yang W.
Lee",
  title =        "Editors' Comments: Where the {JDIQ} Articles Come From: Incubating Research in an Emerging Field",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659226",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 1(3), December 2009, article 14.
@Article{Sessions:2009:TMD,
  author =       "V. Sessions and M. Valtorta",
  title =        "Towards a Method for Data Accuracy Assessment Utilizing a {Bayesian} Network Learning Algorithm",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659227",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This research develops a data quality algorithm entitled the Accuracy Assessment Algorithm (AAA). This is an extension of research in developing an enhancement to a Bayesian Network (BN) learning algorithm called the Data Quality (DQ) algorithm. This new algorithm is concerned with estimating the accuracy levels of a dataset by assessing the quality of the data with no prior knowledge of the dataset. The AAA and associated metrics were tested using two canonical BNs and one large-scale medical network. The article presents the results regarding the efficacy of the algorithm and the implications for future research and practice.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf.
Qual.",
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "accuracy levels; Bayesian networks; data quality assessment; PC algorithm",
}

%%% JDIQ 1(3), December 2009, article 15.
@Article{Even:2009:DAD,
  author =       "Adir Even and G. Shankaranarayanan",
  title =        "Dual Assessment of Data Quality in Customer Databases",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659228",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Quantitative assessment of data quality is critical for identifying the presence of data defects and the extent of the damage due to these defects. Quantitative assessment can help define realistic quality improvement targets, track progress, evaluate the impacts of different solutions, and prioritize improvement efforts accordingly. This study describes a methodology for quantitatively assessing both impartial {\em and\/} contextual data quality in large datasets. Impartial assessment measures the extent to which a dataset is defective, independent of the context in which that dataset is used. Contextual assessment, as defined in this study, measures the extent to which the presence of defects reduces a dataset's utility, the benefits gained by using that dataset in a specific context. The dual assessment methodology is demonstrated in the context of Customer Relationship Management (CRM), using large data samples from real-world datasets. The results from comparing the two assessments offer important insights for directing quality maintenance efforts and prioritizing quality improvement solutions for this dataset.
The study describes the steps and the computation involved in the dual-assessment methodology and discusses the implications for applying the methodology in other business contexts and data environments.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "CRM; customer relationship management; databases; Data quality; information value; total data quality management",
}

%%% JDIQ 1(3), December 2009, article 16.
@Article{Fisher:2009:AMP,
  author =       "Craig W. Fisher and Eitel J. M. Lauria and Carolyn C. Matheus",
  title =        "An Accuracy Metric: Percentages, Randomness, and Probabilities",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659229",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Practitioners and researchers regularly refer to error rates or accuracy percentages of databases. The former is the number of cells in error divided by the total number of cells; the latter is the number of correct cells divided by the total number of cells. However, databases may have similar error rates (or accuracy percentages) but differ drastically in the complexity of their accuracy problems. A simple percent does not provide information as to whether the errors are systematic or randomly distributed throughout the database. We expand the accuracy metric to include a randomness measure and include a probability distribution value. The proposed randomness check is based on the Lempel--Ziv (LZ) complexity measure. Through two simulation studies we show that the LZ complexity measure can clearly differentiate as to whether the errors are random or systematic.
This determination is a significant first step and is a major departure from the percentage-alone technique. Once it is determined that the errors are random, a probability distribution, Poisson, is used to help address various managerial questions.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "complexity; Data and information quality; randomness",
}

%%% JDIQ 1(3), December 2009, article 17.
@Article{Ababneh:2009:CSE,
  author =       "Sufyan Ababneh and Rashid Ansari and Ashfaq Khokhar",
  title =        "Compensated Signature Embedding for Multimedia Content Authentication",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659230",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "One of the main goals of digital content authentication and preservation techniques is to guarantee the originality and quality of the information. In this article, robust watermarking is used to embed content-based fragile signatures in multimedia signals to achieve efficient authentication without requiring any third-party reference or side information. To overcome the signature alteration caused by the embedding perturbation and other possible encoding operations, a closed-form compensation technique is proposed for ensuring signature consistency by employing a Lagrangian-based approach. A minimum distortion criterion is used to ensure signal quality. The effectiveness of the proposed approach is investigated with simulations of examples of image authentication in which signatures are designed to reveal tamper localization.
Results using quantitative performance criteria show successful authentication over a range of robustness in embedding watermarks using both QIM-DM and spread-spectrum techniques. A comparison with two iterative compensation schemes is also presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "compensated signature embedding; Content authentication; watermarking",
}

%%% JDIQ 2(1), July 2010, article 1.
@Article{Madnick:2010:ECA,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "{Editors}' Comments: {ACM Journal of Data and Information Quality (JDIQ)} is alive and well!",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805287",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 2(1), July 2010, article 2.
@Article{Tremblay:2010:UDM,
  author =       "Monica Chiarini Tremblay and Kaushik Dutta and Debra Vandermeer",
  title =        "Using Data Mining Techniques to Discover Bias Patterns in Missing Data",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805288",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In today's data-rich environment, decision makers draw conclusions from data repositories that may contain data quality problems.
In this context, missing data is an important and known problem, since it can seriously affect the accuracy of conclusions drawn. Researchers have described several approaches for dealing with missing data, primarily attempting to infer values or estimate the impact of missing data on conclusions. However, few have considered approaches to characterize patterns of bias in missing data, that is, to determine the specific attributes that predict the missingness of data values. Knowledge of the specific systematic bias patterns in the incidence of missing data can help analysts more accurately assess the quality of conclusions drawn from data sets with missing data. This research proposes a methodology to combine a number of Knowledge Discovery and Data Mining techniques, including association rule mining, to discover patterns in related attribute values that help characterize these bias patterns. We demonstrate the efficacy of our proposed approach by applying it on a demo census dataset seeded with biased missing data. The experimental results show that our approach was able to find seeded biases and filter out most seeded noise.",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Data Inf. Qual.",
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "Data quality; missing data; pattern discovery",
}

%%% JDIQ 2(1), July 2010, article 3.
@Article{Jensen:2010:JCI,
  author =       "Matthew L. Jensen and Judee K. Burgoon and Jay F.
{Nunamaker, Jr.}", title = "Judging the Credibility of Information Gathered from Face-to-Face Interactions", journal = j-JDIQ, volume = "2", number = "1", pages = "3:1--3:??", month = jul, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1805286.1805289", ISSN = "1936-1955", bibdate = "Tue Sep 7 08:41:54 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "One of the most pernicious threats to information quality comes through perpetration of deception by information suppliers. Deception undermines many critical dimensions of information quality, such as accuracy, completeness, and believability. Despite this threat, information gatherers are ill equipped to assess the credibility of information suppliers. This work presents a prototype system that examines messages gathered during direct, face-to-face information gathering. The system unobtrusively identifies kinesic and linguistic features that may indicate deception in information suppliers' messages. System use was found to significantly improve assessment ability in between-subjects and within-subjects tests. The improved ability to accurately assess credibility during face-to-face interactions should yield higher information quality.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", keywords = "Credibility assessment; deception detection; decision-aids; human-computer interaction; information veracity; kinesics; linguistics", } @Article{Meda:2010:DDF, author = "Hema S. 
Meda and Anup Kumar Sen and Amitava Bagchi", title = "On Detecting Data Flow Errors in Workflows", journal = j-JDIQ, volume = "2", number = "1", pages = "4:1--4:??", month = jul, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1805286.1805290", ISSN = "1936-1955", bibdate = "Tue Sep 7 08:41:54 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "When designing a business workflow, it is customary practice to create the control flow structure first and to ensure its correctness. Information about the flow of data is introduced subsequently into the workflow and its correctness is independently verified. Improper specification of data requirements of tasks and XOR splits can cause problems such as wrong branching at XOR splits and the failure of tasks to execute. Here we present a graph traversal algorithm called GTforDF for detecting data flow errors in both nested and unstructured workflows, and illustrate its operation on realistic examples. Two of these have interconnected loops and are free of control flow errors, and the third one is an unstructured loop-free workflow. Our approach extends and generalizes data flow verification methods that have been recently proposed. It also makes use of the concept of corresponding pairs lately introduced in control flow verification. It thus has the potential for development into a unified algorithmic procedure for the concurrent detection of control flow and data flow errors. The correctness of the algorithm has been proved theoretically. It has also been tested experimentally on many examples.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", keywords = "Corresponding pair; Data flow errors; Workflow management", } @Article{Magnani:2010:SUM, author = "Matteo Magnani and Danilo Montesi", title = "A Survey on Uncertainty Management in Data Integration", journal = j-JDIQ, volume = "2", number = "1", pages = "5:1--5:??", month = jul, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1805286.1805291", ISSN = "1936-1955", bibdate = "Tue Sep 7 08:41:54 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In the last few years, uncertainty management has come to be recognized as a fundamental aspect of data integration. It is now accepted that it may not be possible to remove uncertainty generated during data integration processes and that uncertainty in itself may represent a source of relevant information. Several issues, such as the aggregation of uncertain mappings and the querying of uncertain mediated schemata, have been addressed by applying well-known uncertainty management theories. However, several problems lie unresolved. This article sketches an initial picture of this highly active research area; it details existing works in the light of a homogeneous framework, and identifies and discusses the leading issues awaiting solutions.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", keywords = "Data integration; uncertainty", } @Article{Talburt:2010:CPS, author = "John R. Talburt and Stuart E. Madnick and Yang W. 
Lee", title = "Call for Papers: Special Issue on Entity Resolution", journal = j-JDIQ, volume = "2", number = "1", pages = "6:1--6:??", month = jul, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1805286.1805292", ISSN = "1936-1955", bibdate = "Tue Sep 7 08:41:54 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Madnick:2011:ESN, author = "Stuart E. Madnick and Yang W. Lee", title = "Editorial: In Search of Novel Ideas and Solutions with a Broader Context of Data Quality in Mind", journal = j-JDIQ, volume = "2", number = "2", pages = "7:1--7:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1891879.1891880", ISSN = "1936-1955", bibdate = "Mon Mar 28 12:03:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Blake:2011:EID, author = "Roger Blake and Paul Mangiameli", title = "The Effects and Interactions of Data Quality and Problem Complexity on Classification", journal = j-JDIQ, volume = "2", number = "2", pages = "8:1--8:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1891879.1891881", ISSN = "1936-1955", bibdate = "Mon Mar 28 12:03:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Gelman:2011:GGA, author = "Irit Askira Gelman", title = "{GIGO} or not {GIGO}: The Accuracy of Multi-Criteria Satisficing Decisions", journal = j-JDIQ, volume = "2", number = "2", pages = "9:1--9:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1891879.1891882", ISSN = "1936-1955", bibdate = "Mon Mar 28 12:03:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Fan:2011:GBN, author = "Xiaoming Fan and Jianyong Wang and Xu Pu and Lizhu Zhou and Bing Lv", title = "On Graph-Based Name Disambiguation", journal = j-JDIQ, volume = "2", number = "2", pages = "10:1--10:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1891879.1891883", ISSN = "1936-1955", bibdate = "Mon Mar 28 12:03:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ngugi:2011:TBI, author = "Benjamin Ngugi and Beverly K. 
Kahn and Marilyn Tremaine", title = "Typing Biometrics: Impact of Human Learning on Performance Quality", journal = j-JDIQ, volume = "2", number = "2", pages = "11:1--11:??", month = feb, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1891879.1891884", ISSN = "1936-1955", bibdate = "Mon Mar 28 12:03:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Madnick:2011:ENC, author = "Stuart E. Madnick and Yang W. Lee", title = "Editorial Notes: Classification and Assessment of Large Amounts of Data: Examples in the Healthcare Industry and Collaborative Digital Libraries", journal = j-JDIQ, volume = "2", number = "3", pages = "12:1--12:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2063504.2063505", ISSN = "1936-1955", bibdate = "Thu Dec 15 09:41:55 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Lauria:2011:CBT, author = "Eitel J. M. Laur{\'\i}a and Alan D. March", title = "Combining {Bayesian} Text Classification and Shrinkage to Automate Healthcare Coding: a Data Quality Analysis", journal = j-JDIQ, volume = "2", number = "3", pages = "13:1--13:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2063504.2063506", ISSN = "1936-1955", bibdate = "Thu Dec 15 09:41:55 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. 
Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Dalip:2011:AAD, author = "Daniel Hasan Dalip and Marcos Andr{\'e} Gon{\c{c}}alves and Marco Cristo and P{\'a}vel Calado", title = "Automatic Assessment of Document Quality in {Web} Collaborative Digital Libraries", journal = j-JDIQ, volume = "2", number = "3", pages = "14:1--14:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2063504.2063507", ISSN = "1936-1955", bibdate = "Thu Dec 15 09:41:55 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Muller:2012:IDQ, author = "Heiko M{\"u}ller and Johann-Christoph Freytag and Ulf Leser", title = "Improving data quality by source analysis", journal = j-JDIQ, volume = "2", number = "4", pages = "15:1--15:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2107536.2107538", ISSN = "1936-1955", bibdate = "Fri Mar 16 15:01:48 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In many domains, data cleaning is hampered by our limited ability to specify a comprehensive set of integrity constraints to assist in identification of erroneous data. An alternative approach to improve data quality is to exploit different data sources that contain information about the same set of objects. Such overlapping sources highlight hot-spots of poor data quality through conflicting data values and immediately provide alternative values for conflict resolution. 
In order to derive a dataset of high quality, we can merge the overlapping sources based on a quality assessment of the conflicting values. The quality of the resulting dataset, however, is highly dependent on our ability to assess the quality of conflicting values effectively. The main objective of this article is to introduce methods that aid the developer of an integrated system over overlapping, but contradicting sources in the task of improving the quality of data. Value conflicts between contradicting sources are often systematic, caused by some characteristic of the different sources. Our goal is to identify such systematic differences and outline data patterns that occur in conjunction with them. Evaluated by an expert user, the regularities discovered provide insights into possible conflict reasons and help to assess the quality of inconsistent values. The contributions of this article are two concepts of systematic conflicts: contradiction patterns and minimal update sequences. Contradiction patterns resemble a special form of association rules that summarize characteristic data properties for conflict occurrence. We adapt existing association rule mining algorithms for mining contradiction patterns. Contradiction patterns, however, view each class of conflicts in isolation, sometimes leading to largely overlapping patterns. Sequences of set-oriented update operations that transform one data source into the other are compact descriptions for all regular differences among the sources. We consider minimal update sequences as the most likely explanation for observed differences between overlapping data sources. Furthermore, the order of operations within the sequences points out potential dependencies between systematic differences. Finding minimal update sequences, however, is beyond reach in practice. We show that the problem already is NP-complete for a restricted set of operations.
In the light of this intractability result, we present heuristics that lead to convincing results for all examples we considered.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Gelman:2012:BMC, author = "Irit Askira Gelman", title = "Biases in multi-criteria, satisfying decisions due to data errors", journal = j-JDIQ, volume = "2", number = "4", pages = "16:1--16:??", month = feb, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2107536.2107539", ISSN = "1936-1955", bibdate = "Fri Mar 16 15:01:48 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This inquiry centers on an asymmetry, or bias, in the accuracy of multi-criteria, conjunctive, and disjunctive decisions, which originates from fundamental properties of the logical conjunction and disjunction operations. A mathematical-statistical analysis indicates that, as we keep adding criteria to a multi-criteria conjunctive or disjunctive decision rule, errors in the data produce decision errors asymmetrically. As a result, in conjunctive decisions, the probability of a false negative increases while the probability of a false positive decreases. In contrast, in disjunctive decisions, as we keep adding criteria, the probability of a false positive increases while that of a false negative decreases. For instance, in a conjunctive business decision rule, the probability of overlooking a bargain can be far greater than the probability of misjudging an unattractive offer to be a good one. A series of Monte Carlo simulations validates the analytical findings and explores the contribution of several additional factors.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Sachdeva:2012:SIS, author = "Shelly Sachdeva and Subhash Bhalla", title = "Semantic interoperability in standardized electronic health record databases", journal = j-JDIQ, volume = "3", number = "1", pages = "1:1--1:??", month = apr, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2166788.2166789", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:12 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Different clinics and hospitals have their own information systems to maintain patient data. This hinders the exchange of data among systems (and organizations). Hence there is a need to provide standards for data exchange. In digitized form, the individual patient's medical record can be stored, retrieved, and shared over a network through enhancement in information technology. Thus, electronic health records (EHRs) should be standardized, incorporating semantic interoperability. A subsequent step requires that healthcare professionals and patients get involved in using the EHRs, with the help of technological developments. This study aims to provide different approaches in understanding some current and challenging concepts in health informatics. Successful handling of these challenges will lead to improved quality in healthcare by reducing medical errors, decreasing costs, and enhancing patient care. 
The study is focused on the following goals: (1) understanding the role of EHRs; (2) understanding the need for standardization to improve quality; (3) establishing interoperability in maintaining EHRs; (4) examining a framework for standardization and interoperability (the openEHR architecture); (5) identifying the role of archetypes for knowledge-based systems; and (6) understanding the difficulties in querying EHR data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Brown:2012:DQT, author = "Steven Brown and Trent S. Rosenbloom and Shawn P. Hardenbrook and Terry Clark and Elliot Fielstein and Peter Elkin and Ted Speroff", title = "Documentation quality and time costs: a randomized controlled trial of structured entry versus dictation", journal = j-JDIQ, volume = "3", number = "1", pages = "2:1--2:??", month = apr, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2166788.2166790", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:12 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The Department of Veterans Affairs (VA) performs over 800,000 disability exams and distributes over \$37 billion in disability benefits per year. VA developed and deployed a computer-based disability exam documentation system in order to improve exam report quality and timeliness. We conducted a randomized controlled trial comparing joint disability examinations supported by computerized templates to the examinations documented via dictation, to determine if the system met the intended goals or had unintended consequences. Consenting veterans were randomized to undergo exams documented using computerized templates or via dictation.
We compared exam report quality, documentation time costs, encounter length, total time to fulfill an exam request with a finalized exam report, and veteran satisfaction. Computer-based templates resulted in disability exam reports that had higher quality scores (p. 0.042) and were returned to the requesting office faster than exam reports created via dictation (p. 0.02). Documentation time and veteran satisfaction were similar for both the documentation techniques. Encounter length was significantly longer for the template group. Computer-based templates impacted the VA disability evaluation system by improving report quality scores and production time and lengthening encounter times. Oversight bodies have called for mandated use of computer-based templates nationwide. We believe mandates regarding use of health information technology should be guided by data regarding its positive and negative impacts.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Sunyaev:2012:SCD, author = "Ali Sunyaev and Dmitry Chornyi", title = "Supporting chronic disease care quality: Design and implementation of a health service and its integration with electronic health records", journal = j-JDIQ, volume = "3", number = "2", pages = "3:1--3:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2184442.2184443", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:12 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Chronic medical conditions take a huge toll on lives of a growing number of people and are a major contributor to the rising costs in healthcare. 
As patients are increasingly willing to take an active part in managing their conditions, chronic disease self-management programs and information systems that support them are recognized for their potential to improve the quality of healthcare delivery. These programs often rely on recording longitudinal patient data and analyzing it. Therefore, maintaining appropriate data quality is important for self-management programs to be efficient and safe. We designed and implemented a prototype of a health self-management service for chronically ill people. It is a distributed application that supports patients with diabetes at tracking their blood glucose levels. The main design goals were usability, extensibility, security, and interoperability. The system integrates with the Microsoft HealthVault and Google Health personal health record platforms. It utilizes industry-strength storage and security mechanisms, is scalable, and as a result, can be used to gather, securely store, and analyze patient data over long periods of time. In this article we examine how software information technology can support chronic disease self-management and its impact on the quality of patient data. Furthermore, we describe the requirements that drove the system's development, its architecture, and design decisions.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Elizabeth:2012:NSA, author = "D. Shiloah Elizabeth and H. Khanna Nehemiah and C. Sunil Retmin Raj and A. 
Kannan", title = "A novel segmentation approach for improving diagnostic accuracy of {CAD} systems for detecting lung cancer from chest computed tomography images", journal = j-JDIQ, volume = "3", number = "2", pages = "4:1--4:??", month = may, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2184442.2184444", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:12 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Segmentation of lung tissue is an important and challenging task in any computer aided diagnosis system. The accuracy of the segmentation subsystem determines the performance of the other subsystems in any computer aided diagnosis system based on image analysis. We propose a novel technique for segmentation of lung tissue from computed tomography of the chest. Manual segmentation of lung parenchyma becomes difficult with an enormous volume of images. The goal of this work is to present an automated approach to segmentation of lung parenchyma from the rest of the chest CT image. The approach involves the conventional optimal thresholding technique and operations based on convex edge and centroid properties of the lung region. The segmentation technique proposed in this article can be used to preprocess lung images given to a computer aided diagnosis system for diagnosis of lung disorders. This improves the diagnostic performance of the system. This has been tested by using it in a computer aided diagnosis system that was used for detection of lung cancer from chest computed tomography images. The results obtained show that the lungs can be correctly segmented even in the presence of peripheral pathology bearing regions; pathology bearing regions that could not be detected using a CAD system that applies optimal thresholding could be detected using a CAD system using our proposed approach for segmentation of lungs.", acknowledgement = ack-nhfb, ajournal = "J.
Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Yakout:2012:EPA, author = "Mohamed Yakout and Mikhail J. Atallah and Ahmed Elmagarmid", title = "Efficient and Practical Approach for Private Record Linkage", journal = j-JDIQ, volume = "3", number = "3", pages = "5:1--5:??", month = aug, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2287714.2287715", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:13 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Record linkage is used to associate entities from multiple data sources. For example, two organizations contemplating a merger may want to know how common their customer bases are so that they may better assess the benefits of the merger. Another example is a database of people who are forbidden from a certain activity by regulators, may need to be compared to a list of people engaged in that activity. The autonomous entities who wish to carry out the record matching computation are often reluctant to fully share their data; they fear losing control over its subsequent dissemination and usage, or they want to insure privacy because the data is proprietary or confidential, and/or they are cautious simply because privacy laws forbid its disclosure or regulate the form of that disclosure. In such cases, the problem of carrying out the linkage computation without full data exchange has been called private record linkage. Previous private record linkage techniques have made use of a third party. 
We provide efficient techniques for private record linkage that improve on previous work in that (1) our techniques make no use of a third party, and (2) they achieve much better performance than previous schemes in terms of their execution time while maintaining acceptable quality of output compared to nonprivacy settings. Our protocol consists of two phases. The first phase primarily produces candidate record pairs for matching, by carrying out a very fast (but not accurate) matching between such pairs of records. The second phase is a novel protocol for efficiently computing distances between each candidate pair (without any expensive cryptographic operations such as modular exponentiations). Our experimental evaluation of our approach validates these claims.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Yang:2012:ECD, author = "Yanjuan Yang and Michael Mannino", title = "An Experimental Comparison of a Document Deception Detection Policy using Real and Artificial Deception", journal = j-JDIQ, volume = "3", number = "3", pages = "6:1--6:??", month = aug, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2287714.2287716", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:13 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Developing policies to screen documents for deception is often hampered by the cost of data collection and the inability to evaluate policy alternatives due to lack of data. To lower data collection costs and increase the amount of data, artificially generated deception data can be used, but the impact of using artificially generated deception data is not well understood. This article studies the impact of artificially generated deception on document screening policies. 
The deception and truth data were collected from financial aid applications, a document-centric area with limited resources for screening. Real deception was augmented with artificial data generated by noise and deception generation models. Using the real data and artificially generated data, we designed an innovative experiment with deception type and deception rate as factors, and harmonic mean and cost as outcome variables. We used two budget models (fixed and variable) typically employed by financial aid offices to measure the cost of noncompliance in financial aid applications. The analysis included an evaluation of a common policy for deception screening using both fixed and varying screening rates. The results of the experiment provided evidence of similar performance of screening policy with real and artificial deception, suggesting the possibility of using artificially generated deception to reduce the costs associated with obtaining training data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Robb:2012:INU, author = "David A. Robb and Paul L. Bowen and A. Faye Borthick and Fiona H. Rohde", title = "Improving New Users' Query Performance: Deterring Premature Stopping of Query Revision with Information for Forming Ex Ante Expectations", journal = j-JDIQ, volume = "3", number = "4", pages = "7:1--7:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348828.2348829", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "As the volume of data in organizational databases grows, organizations are seeking to use this data to improve organizational success. 
To this end, users are being asked to query these databases to provide information to help answer questions posed by key management personnel. Users who have had extensive experience with an organization's data can often detect the presence of errors in their queries when query results do not correspond to their ex ante expectations. New users, however, are less familiar with the data they will be querying. Having no, or limited, ex ante expectations for query results, new users may be unaware that the result produced by their query is incorrect. Unwarranted confidence in the correctness of their queries predisposes these users to stop looking for query errors even when their queries still contain errors. This behavior, premature stopping of query revision, prompts investigating whether new users' query performance would improve if they were not only provided with, but used, readily available information to form ex ante expectations. Our results demonstrated a threshold effect in new users heeding information for forming ex ante expectations. That is, the mere availability of information for forming ex ante expectations made no difference in query performance. When admonishing users to heed ex ante information, however, there was an associated increase in the accuracy of their queries. These results suggest that users unfamiliar with a particular database might make fewer query errors if they not only received readily available information but were then prompted to use the information to form ex ante expectations for query results.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Varol:2012:HMA, author = "Cihan Varol and Coskun Bayrak", title = "Hybrid Matching Algorithm for Personal Names", journal = j-JDIQ, volume = "3", number = "4", pages = "8:1--8:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2348828.2348830", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib; https://www.math.utah.edu/pub/tex/bib/spell.bib", abstract = "Companies acquire personal information from phone, World Wide Web, or email in order to sell or send an advertisement about their product. However, when this information is acquired, moved, copied, or edited, the data may lose its quality. Often, the use of data administrators or a tool that has limited capabilities to correct the mistyped information can cause many problems. Moreover, most of the correction techniques are particularly implemented for the words used in daily conversations. Since personal names have different characteristics compared to general text, a hybrid matching algorithm (PNRS) which employs phonetic encoding, string matching and statistical facts to provide a possible candidate for misspelled names is developed. At the end, the efficiency of the proposed algorithm is compared with other well known spelling correction techniques.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{ODonoghue:2012:ISI, author = "John O'Donoghue and Jane Grimson and Katherine Seelman", title = "Introduction to the Special Issue on Information Quality: The Challenges and Opportunities in Healthcare Systems and Services", journal = j-JDIQ, volume = "4", number = "1", pages = "1:1--1:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2378016.2378017", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Collins:2012:CGF, author = "Claire Collins and Kelly Janssens", title = "Creating a General (Family) Practice Epidemiological Database in {Ireland} --- Data Quality Issue Management", journal = j-JDIQ, volume = "4", number = "1", pages = "2:1--2:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2378016.2378018", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In Ireland, while detailed information is available regarding hospital attendance, little is known regarding general (family) practice attendance. However, it is conservatively estimated that there are almost nine times as many general practice encounters as there are hospital encounters each year in Ireland. This represents a very significant gap in health information.
Indeed, general practice has been shown in other countries to be an important and rich source of information about the health of the population, their behaviors and their utilization of health services. Funded by the Health Information and Quality Authority (HIQA), the Irish College of General Practitioners (ICGP) undertook a feasibility study of diagnostic coding of routinely entered patient data and the creation of a national general practice morbidity and epidemiological database (GPMED project). This article outlines the process of data quality issue management undertaken. The study's findings suggest that the quality of data collection and reporting structures available in general practice throughout Ireland at the outset of this project were not adequate to permit the creation of a database of sufficient quality for service planning and policy or epidemiological research. Challenges include the dearth of a minimum standard of data recorded in consultations by GPs and the absence of the digital data recording and exporting infrastructure within Irish patient management software systems. In addition, there is at present a lack of recognition regarding the value of such data for patient management and service planning---including importantly, data collectors who do not fully accept the merit of maintaining data, which has a direct consequence for data quality. The work of this project has substantial implications for the data available to the health sector in Ireland and contributes to the knowledge base internationally regarding general practice morbidity data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Cure:2012:IDQ, author = "Olivier Cur{\'e}", title = "Improving the Data Quality of Drug Databases using Conditional Dependencies and Ontologies", journal = j-JDIQ, volume = "4", number = "1", pages = "3:1--3:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2378016.2378019", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Many health care systems and services exploit drug related information stored in databases. The poor data quality of these databases, e.g. inaccuracy of drug contraindications, can lead to catastrophic consequences for the health condition of patients. Hence it is important to ensure their quality in terms of data completeness and soundness. In the database domain, standard Functional Dependencies (FDs) and INclusion Dependencies (INDs), have been proposed to prevent the insertion of incorrect data. But they are generally not expressive enough to represent a domain-specific set of constraints. To this end, conditional dependencies, i.e. standard dependencies extended with tableau patterns containing constant values, have been introduced and several methods have been proposed for their discovery and representation. The quality of drug databases can be considerably improved by their usage. Moreover, pharmacology information is inherently hierarchical and many standards propose graph structures to represent them, e.g. the Anatomical Therapeutic Chemical classification (ATC) or OpenGalen's terminology. In this article, we emphasize that the technologies of the Semantic Web are adapted to represent these hierarchical structures, i.e. in RDFS and OWL. 
We also present a solution for representing conditional dependencies using a query language defined for these graph oriented structures, namely SPARQL. The benefits of this approach are interoperability with applications and ontologies of the Semantic Web as well as a reasoning-based query execution solution to clean underlying databases.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{McNaull:2012:DIQ, author = "James McNaull and Juan Carlos Augusto and Maurice Mulvenna and Paul McCullagh", title = "Data and Information Quality Issues in Ambient Assisted Living Systems", journal = j-JDIQ, volume = "4", number = "1", pages = "4:1--4:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2378016.2378020", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Demographic aging, as a result of people living for longer, has put an increased burden on health and social care provision across most of the economies of the developed and developing world. In order to cope with the greater numbers of older people, together with increasing prevalence of chronic diseases, governments are looking to new ways to provide care and support to older people and their care providers. A growing trend is where health and social care providers are moving towards the use of assisted living technologies to provide care and assistance in the home. In this article, the research area of Ambient Assisted Living (AAL) systems is examined and the data, information and the higher-level contextual knowledge quality issues in relation to these systems, is discussed. 
Lack of quality control may result in an AAL system providing assistance and support based upon incorrect data, information and knowledge inputs, and this may have a detrimental effect on the person making use of the system. We propose a model whereby contextual knowledge gained during the AAL system's reasoning cycle can be fed back to aid in further quality checking at the various architectural layers, and a realistic AAL scenario is provided to support this. Future research should be conducted in these areas, with the requirement of building quality criteria into the design and implementation of AAL systems.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{ODonoghue:2012:DMW, author = "John O'Donoghue and John Herbert", title = "Data Management within {mHealth} Environments: Patient Sensors, Mobile Devices, and Databases", journal = j-JDIQ, volume = "4", number = "1", pages = "5:1--5:??", month = oct, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2378016.2378021", ISSN = "1936-1955", bibdate = "Thu Nov 8 18:27:14 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Pervasive environments generate large quantities of data, originating from backend servers, portable devices, and wireless mobile sensors. Pervasive sensing devices that monitor properties of the environment (including human beings) can be a large data source. Unprocessed datasets may include data that is faulty and irrelevant, and data that is important and useful. If not managed correctly the large amount of data from a data-rich pervasive environment may result in information overload or delivery of incorrect information. 
Context-sensitive quality data management aims to gather, verify, process, and manage the multiple data sources in a pervasive environment in order to deliver high quality, relevant information to the end-user. Managing the quality of data from different sources, correlating related data, and making use of context, are all essential in providing end users with accurate and meaningful data in real time. This requirement is especially true for critical applications such as in a medical environment. This article presents the Data Management System (DMS) architecture. It is designed to deliver quality data service to its users. The DMS architecture employs an agent-based middleware to intelligently and effectively manage all pervasive data sources, and to make use of context to deliver relevant information to the end-user. Two of the DMS components are presented: (1) data validation and (2) data consistency. The DMS components have been rigorously evaluated using various medical-based test cases. This article demonstrates a careful, precise approach to data based on the quality of the data and the context of its use. It emphasises the DMS architecture and the role of software agents in providing quality data management.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Talburt:2013:SIE, author = "John R. Talburt", title = "Special Issue on Entity Resolution Overview: The Criticality of Entity Resolution in Data and Information Quality", journal = j-JDIQ, volume = "4", number = "2", pages = "6:1--6:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2435221.2435222", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:00 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. 
Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Song:2013:DIE, author = "Dezhao Song and Jeff Heflin", title = "Domain-Independent Entity Coreference for Linking Ontology Instances", journal = j-JDIQ, volume = "4", number = "2", pages = "7:1--7:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2435221.2435223", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:00 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The objective of entity coreference is to determine if different mentions (e.g., person names, place names, database records, ontology instances, etc.) refer to the same real-world object. Entity coreference algorithms can be used to detect duplicate database records and to determine if two Semantic Web instances represent the same underlying real-world entity. The key issues in developing an entity coreference algorithm include how to locate context information and how to utilize the context appropriately. In this article, we present a novel entity coreference algorithm for ontology instances. For scalability reasons, we select a neighborhood of each instance from an RDF graph. To determine the similarity between two instances, our algorithm computes the similarity between comparable property values in the neighborhood graphs. The similarity of distinct URIs and blank nodes is computed by comparing their outgoing links. In an attempt to reduce the impact of distant nodes on the final similarity measure, we explore a distance-based discounting approach. To provide the best possible domain-independent matches, we propose an approach to compute the discriminability of triples in order to assign weights to the context information. We evaluated our algorithm using different instance categories from five datasets.
Our experiments show that the best results are achieved by including both our discounting and triple discrimination approaches.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Nuray-Turan:2013:ACS, author = "Rabia Nuray-Turan and Dmitri V. Kalashnikov and Sharad Mehrotra", title = "Adaptive Connection Strength Models for Relationship-Based Entity Resolution", journal = j-JDIQ, volume = "4", number = "2", pages = "8:1--8:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2435221.2435224", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:00 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Entity Resolution (ER) is a data quality challenge that deals with ambiguous references in data and whose task is to identify all references that co-refer. Due to practical significance of the ER problem, many creative ER techniques have been proposed in the past, including those that analyze relationships that exist among entities in data. Such approaches view the database as an entity-relationship graph, where direct and indirect relationships correspond to paths in the graph. These techniques rely on measuring the connection strength among various nodes in the graph by using a connection strength (CS) model. While such approaches have demonstrated significant advantage over traditional ER techniques, currently they also have a significant limitation: the CS models that they use are intuition-based fixed models that tend to behave well in general, but are very generic and not tuned to a specific domain, leading to suboptimal result quality. 
Hence, in this article we propose an approach that employs supervised learning to adapt the connection strength measure to the given domain using the available past/training data. The adaptive approach has several advantages: it increases both the quality and efficiency of ER and it also minimizes the domain analyst participation needed to tune the CS model to the given domain. The extensive empirical evaluation demonstrates that the proposed approach reaches up to 8\% higher accuracy than the graph-based ER methods that use fixed and intuition-based CS models.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Panse:2013:IHU, author = "Fabian Panse and Maurice van Keulen and Norbert Ritter", title = "Indeterministic Handling of Uncertain Decisions in Deduplication", journal = j-JDIQ, volume = "4", number = "2", pages = "9:1--9:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2435221.2435225", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:00 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In current research and practice, deduplication is usually considered as a deterministic approach in which database tuples are either declared to be duplicates or not. In ambiguous situations, however, it is often not completely clear-cut, which tuples represent the same real-world entity. In deterministic approaches, many realistic possibilities may be ignored, which in turn can lead to false decisions. In this article, we present an indeterministic approach for deduplication by using a probabilistic target model including techniques for proper probabilistic interpretation of similarity matching results. 
Thus, instead of deciding for one of the most likely situations, all realistic situations are modeled in the resultant data. This approach minimizes the negative impact of false decisions. Moreover, the deduplication process becomes almost fully automatic and human effort can be largely reduced. To increase applicability, we introduce several semi-indeterministic methods that heuristically reduce the set of indeterministically handled decisions in several meaningful ways. We also describe a full-indeterministic method for theoretical and presentational reasons.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Zhou:2013:GLC, author = "Yinle Zhou and Eric Nelson and Fumiko Kobayashi and John R. Talburt", title = "A Graduate-Level Course on Entity Resolution and Information Quality: a Step toward {ER} Education", journal = j-JDIQ, volume = "4", number = "2", pages = "10:1--10:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2435221.2435226", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:00 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This article discusses the topics, approaches, and lessons learned in teaching a graduate-level course covering entity resolution (ER) and its relationship to information quality (IQ). The course surveys a broad spectrum of ER topics and activities including entity reference extraction, entity reference preparation, entity reference resolution techniques, entity identity management, and entity relationship analysis. The course content also attempts to balance aspects of ER theory with practical application through a series of laboratory exercises coordinated with the lecture topics. 
As an additional teaching aid, a configurable, open-source entity resolution engine (OYSTER) was developed that allows students to experiment with different types of ER architectures including merge-purge, record linking, identity resolution, and identity capture.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Cao:2013:NAD, author = "Lan Cao and Hongwei Zhu", title = "Normal Accidents: Data Quality Problems in {ERP}-Enabled Manufacturing", journal = j-JDIQ, volume = "4", number = "3", pages = "11:1--11:??", month = may, year = "2013", CODEN = "????", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:05 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The efficient operation of Enterprise Resource Planning (ERP) systems largely depends on data quality. ERP can improve data quality and information sharing within an organization. It can also pose challenges to data quality. While it is well known that data quality is important in ERP systems, most existing research has focused on identifying the factors affecting the implementation and the business values of ERP. With normal accident theory as a theoretical lens, we examine data quality problems in ERP using a case study of a large, fast-growing multinational manufacturer headquartered in China. Our findings show that organizations that have successfully implemented ERP can still experience certain data quality problems. We identify major data quality problems in data production, storage and maintenance, and utilization processes. We also analyze the causes of these data quality problems by linking them to certain characteristics of ERP systems within an organizational context.
Our analysis shows that problems resulting from the tight coupling effects and the complexity of ERP-enabled manufacturing systems can be inevitable. This study will help researchers and practitioners formulate data management strategies that are effective in the presence of certain ``normal'' data quality problems.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Biran:2013:CII, author = "Dov Biran and Michael H. Zack and Richard J. Briotta", title = "Competitive Intelligence and Information Quality: a Game-Theoretic Perspective", journal = j-JDIQ, volume = "4", number = "3", pages = "12:1--12:??", month = may, year = "2013", CODEN = "????", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:05 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "To better understand a competitor's tactical and strategic plans, companies need to take a closer look at competitive intelligence or they risk missing lucrative opportunities. Because of this there is a growing interest in competitive intelligence and intelligence information gathering systems (IIS). This article uses game-theoretic concepts to develop an analytic framework to assess the value of deploying a competitive intelligence gathering information system. Modeling the competitive environment as a game provides a useful approach to study and evaluate competitive strategies given diverse assumptions about the quality of the information known by the players. When determining the value of deploying an IIS, decision makers need to examine three components of the competitive environment: the competitive rules of the game, the state of player knowledge, and the reliability of the information gathered.
This framework focuses on competitive environments where the players' state of knowledge (i.e., common versus covert knowledge) and the reliability of the information generated are essential to the decision making process. The article concludes with implications for research and practice.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Joglekar:2013:AAD, author = "Nitin R. Joglekar and Edward G. Anderson and G. Shankaranarayanan", title = "Accuracy of Aggregate Data in Distributed Project Settings: Model, Analysis and Implications", journal = j-JDIQ, volume = "4", number = "3", pages = "13:1--13:??", month = may, year = "2013", CODEN = "????", ISSN = "1936-1955", bibdate = "Sat Jun 22 12:13:05 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "We examine the management of data accuracy in inter-organizational data exchanges using the context of distributed software projects. Organizations typically manage projects by outsourcing portions of the project to partners. Managing a portfolio of such projects requires sharing data regarding the status of work-in-progress residing with the partners and estimates of these projects' completion times. Portfolio managers use these data to assign projects to be outsourced to partners. These data are rarely accurate. Unless these data are filtered, inaccuracies can lead to myopic and expensive sourcing decisions. We develop a model that uses project-status data to identify an optimal assignment of projects to be outsourced. This model permits corruption of project-status data. We use this model to compute the costs of using perfect versus inaccurate project-status data and show that the costs of deviation from optimal are sizable when the inaccuracy in the data is significant.
We further propose a filter to correct inaccurate project-status data and generate an estimate of true progress. With this filter, depending on the relative magnitudes of errors, we show that accuracy of project-status data can be improved and the associated economic benefit is significant. We illustrate the improvement in accuracy and associated economic benefit by instantiating the model and the filter. We further elaborate on how the model parameters may be estimated and used in practice.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Raschid:2014:E, author = "Louiqa Raschid", title = "Editorial", journal = j-JDIQ, volume = "4", number = "4", pages = "14:1--14:??", month = may, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579167", ISSN = "1936-1955", bibdate = "Tue May 27 16:54:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Wijnhoven:2014:VBF, author = "Fons Wijnhoven and Chintan Amrit and Pim Dietz", title = "Value-Based File Retention: File Attributes as File Value and Information Waste Indicators", journal = j-JDIQ, volume = "4", number = "4", pages = "15:1--15:??", month = may, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2567656", ISSN = "1936-1955", bibdate = "Tue May 27 16:54:25 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Several file retention policy methods propose that a file retention policy should be based on file value. 
Though such a retention policy might increase the value of accessible files, the method to arrive at such a policy is under-researched. This article discusses how one can arrive at a method for developing file retention policies based on the use values of files. The method's applicability is initially assessed through a case study at Capgemini, Netherlands. In the case study, we hypothesize that one can develop a file retention policy by testing causal relations between file attributes (as used by file retention methods) and the use value of files. Unfortunately, most file attributes used by file retention methods have a weak correlation with file value, resulting in the conclusion that these methods do not well select out high- and low-value files. This would imply the ineffectiveness of the used attributes in our study or errors in our conceptualization of file value. We continue with the last possibility and develop indicators for file utility (with low utility being waste). With this approach we were able to detect waste files, in a sample of files, with an accuracy of 80\%. We therefore not only suggest further research in information waste detection as part of a file retention policy, but also to further explore other file attributes that could better predict file value and file utility.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", }

@Article{Fan:2014:IBR,
  author = "Wenfei Fan and Shuai Ma and Nan Tang and Wenyuan Yu",
  title = "Interaction between Record Matching and Data Repairing",
  journal = j-JDIQ,
  volume = "4",
  number = "4",
  pages = "16:1--16:??",
  month = may,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2567657",
  ISSN = "1936-1955",
  bibdate = "Tue May 27 16:54:25 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Central to a data cleaning system are record matching and data repairing. Matching aims to identify tuples that refer to the same real-world object, and repairing is to make a database consistent by fixing errors in the data by using integrity constraints. These are typically treated as separate processes in current data cleaning systems, based on heuristic solutions. This article studies a new problem in connection with data cleaning, namely the interaction between record matching and data repairing. We show that repairing can effectively help us identify matches, and vice versa. To capture the interaction, we provide a uniform framework that seamlessly unifies repairing and matching operations to clean a database based on integrity constraints, matching rules, and master data. We give a full treatment of fundamental problems associated with data cleaning via matching and repairing, including the static analyses of constraints and rules taken together, and the complexity, termination, and determinism analyses of data cleaning. We show that these problems are hard, ranging from NP-complete or coNP-complete, to PSPACE-complete. Nevertheless, we propose efficient algorithms to clean data via both matching and repairing. The algorithms find deterministic fixes and reliable fixes based on confidence and entropy analyses, respectively, which are more accurate than fixes generated by heuristics. Heuristic fixes are produced only when deterministic or reliable fixes are unavailable. We experimentally verify that our techniques can significantly improve the accuracy of record matching and data repairing that are taken as separate processes, using real-life and synthetic data.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "16",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Martin:2014:MAE,
  author = "Nigel Martin and Alexandra Poulovassilis and Jianing Wang",
  title = "A Methodology and Architecture Embedding Quality Assessment in Data Integration",
  journal = j-JDIQ,
  volume = "4",
  number = "4",
  pages = "17:1--17:??",
  month = may,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2567663",
  ISSN = "1936-1955",
  bibdate = "Tue May 27 16:54:25 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Data integration aims to combine heterogeneous information sources and to provide interfaces for accessing the integrated resource. Data integration is a collaborative task that may involve many people with different degrees of experience, knowledge of the application domain, and expectations relating to the integrated resource. It may be difficult to determine and control the quality of an integrated resource due to these factors. In this article, we propose a data integration methodology that has embedded within it iterative quality assessment and improvement of the integrated resource. We also propose an architecture for the realisation of this methodology. The quality assessment is based on an ontology representation of different users' quality requirements and of the main elements of the integrated resource. We use description logic as the formal basis for reasoning about users' quality requirements and for validating that an integrated resource satisfies these requirements. We define quality factors and associated metrics which enable the quality of alternative global schemas for an integrated resource to be assessed quantitatively, and hence the improvement which results from the refinement of a global schema following our methodology to be measured. We evaluate our approach through a large-scale real-life case study in biological data integration in which an integrated resource is constructed from three autonomous proteomics data sources.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "17",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% Volume 5, Number 1--2 (August 2014)

@Article{Naumann:2014:E,
  author = "Felix Naumann",
  title = "Editorial",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "1:1--1:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2648781",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "1",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Talburt:2014:IQR,
  author = "John Talburt and Therese L. Williams and Thomas C. Redman and David Becker",
  title = "Information quality research challenge: Predicting and quantifying the impact of social issues on information quality programs",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "2:1--2:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2629603",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "2",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Rahm:2014:DPC,
  author = "Erhard Rahm",
  title = "Discovering product counterfeits in online shops: a big data integration challenge",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "3:1--3:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2629605",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "3",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Christen:2014:CPP,
  author = "Peter Christen and Dinusha Vatsalan and Vassilios S. Verykios",
  title = "Challenges for privacy preservation in data integration",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "4:1--4:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2629604",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Techniques for integrating data from diverse sources have attracted significant interest in recent years. Much of today's data collected by businesses and governments are about people, and integrating such data across organizations can raise privacy concerns. Various techniques that preserve privacy during data integration have been developed, but several challenges persist that need to be solved before such techniques become useful in practical applications. We elaborate on these challenges and discuss research directions.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "4",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Vogel:2014:RGA,
  author = "Tobias Vogel and Arvid Heise and Uwe Draisbach and Dustin Lange and Felix Naumann",
  title = "Reach for gold: an annealing standard to evaluate duplicate detection results",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "5:1--5:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2629687",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Duplicates in a database are one of the prime causes of poor data quality and are at the same time among the most difficult data quality problems to alleviate. To detect and remove such duplicates, many commercial and academic products and methods have been developed. The evaluation of such systems is usually in need of pre-classified results. Such gold standards are often expensive to come by (much manual classification is necessary), not representative (too small or too synthetic), and proprietary and thus preclude repetition (company-internal data). This lament has been uttered in many papers and even more paper reviews. The proposed annealing standard is a structured set of duplicate detection results, some of which are manually verified and some of which are merely validated by many classifiers. As more and more classifiers are evaluated against the annealing standard, more and more results are verified and validation becomes more and more confident. We formally define gold, silver, and the annealing standard and their maintenance. Experiments show how quickly an annealing standard converges to a gold standard. Finally, we provide an annealing standard for 750,000 CDs to the duplicate detection community.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "5",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2014:CRD,
  author = "Wenfei Fan and Floris Geerts and Nan Tang and Wenyuan Yu",
  title = "Conflict resolution with data currency and consistency",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "6:1--6:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2631923",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "This article introduces a new approach for conflict resolution: given a set of tuples pertaining to the same entity, it identifies a single tuple in which each attribute has the latest and consistent value in the set. This problem is important in data integration, data cleaning, and query answering. It is, however, challenging since in practice, reliable time stamps are often absent, among other things. We propose a model for conflict resolution by specifying data currency in terms of partial currency orders and currency constraints and by enforcing data consistency with constant conditional functional dependencies. We show that identifying data currency orders helps us repair inconsistent data, and vice versa. We investigate a number of fundamental problems associated with conflict resolution and establish their complexity. In addition, we introduce a framework and develop algorithms for conflict resolution by integrating data currency and consistency inferences into a single process and by interacting with users. We experimentally verify the accuracy and efficiency of our methods using real-life and synthetic data.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "6",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Glowalla:2014:PDD,
  author = "Paul Glowalla and Ali Sunyaev",
  title = "Process-driven data quality management: a critical review on the application of process modeling languages",
  journal = j-JDIQ,
  volume = "5",
  number = "1--2",
  pages = "7:1--7:??",
  month = aug,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2629568",
  ISSN = "1936-1955",
  bibdate = "Mon Sep 8 08:45:58 MDT 2014",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Data quality is critical to organizational success. In order to improve and sustain data quality in the long term, process-driven data quality management (PDDQM) seeks to redesign processes that create or modify data. Consequently, process modeling is mandatory for PDDQM. Current research examines process modeling languages with respect to representational capabilities. However, there is a gap, since process modeling languages for PDDQM are not considered. We address this research gap by providing a synthesis of the varying applications of process modeling languages for PDDQM. We conducted a keyword-based literature review in conferences as well as 74 high-ranked information systems and computer science journals, reviewing 1,555 articles from 1995 onwards. For practitioners, it is possible to integrate the quality perspective within broadly applied process models. For further research, we derive representational requirements for PDDQM that should be integrated within existing process modeling languages. However, there is a need for further representational analysis to examine the adequacy of upcoming process modeling languages. New or enhanced process modeling languages may substitute for PDDQM-specific process modeling languages and facilitate development of a broadly applicable and accepted process modeling language for PDDQM.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "7",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% Volume 5, Number 3 (February 2015)

@Article{Belhajjame:2015:E,
  author = "Khalid Belhajjame and Domenico Beneventano and Laure Berti-Equille and James Cheney and Victor Cuevas and Tom {De Nies} and Helena Galhardas and Ashish Gehani and Boris Glavic and Paul Groth and Olaf Hartig and Scott Jensen and Andrea Maurino and Gianni Mecca and Renee Miller and Luc Moreau and Mourad Ouzzani and Jaehong Park",
  title = "Editorial",
  journal = j-JDIQ,
  volume = "5",
  number = "3",
  pages = "8:1--8:??",
  month = feb,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2692312",
  ISSN = "1936-1955",
  bibdate = "Tue Mar 3 14:42:39 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "8",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cheah:2015:PQA,
  author = "You-Wei Cheah and Beth Plale",
  title = "Provenance Quality Assessment Methodology and Framework",
  journal = j-JDIQ,
  volume = "5",
  number = "3",
  pages = "9:1--9:??",
  month = feb,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2665069",
  ISSN = "1936-1955",
  bibdate = "Tue Mar 3 14:42:39 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Data provenance, a form of metadata describing the life cycle of a data product, is crucial in the sharing of research data. Research data, when shared over decades, requires recipients to make a determination of both use and trust. That is, can they use the data? More importantly, can they trust it? Knowing the data are of high quality is one factor to establishing fitness for use and trust. Provenance can be used to assert the quality of the data, but the quality of the provenance must be known as well. We propose a framework for assessing the quality of data provenance. We identify quality issues in data provenance, establish key quality dimensions, and define a framework of analysis. We apply the analysis framework to synthetic and real-world provenance.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "9",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Herschel:2015:HAA,
  author = "Melanie Herschel",
  title = "A Hybrid Approach to Answering Why-Not Questions on Relational Query Results",
  journal = j-JDIQ,
  volume = "5",
  number = "3",
  pages = "10:1--10:??",
  month = feb,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2665070",
  ISSN = "1936-1955",
  bibdate = "Tue Mar 3 14:42:39 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "In analyzing and debugging data transformations, or more specifically relational queries, a subproblem is to understand why some data are not part of the query result. This problem has recently been addressed from different perspectives for various fragments of relational queries. The different perspectives yield different yet complementary explanations of such missing answers. This article first aims at unifying the different approaches by defining a new type of explanation, called hybrid explanation, that encompasses the variety of previously defined types of explanations. This solution goes beyond simply forming the union of explanations produced by different algorithms and is shown to be able to explain a larger set of missing answers. Second, we present Conseil, an algorithm to generate hybrid explanations. Conseil is also the first algorithm to handle nonmonotonic queries. Experiments on efficiency and explanation quality show that Conseil is comparable and even outperforms previous algorithms. This article extends a previous short conference paper by providing proofs, additional theorems, and a detailed discussion of each step of the Conseil algorithm. It also significantly extends the experimental evaluation on efficiency and explanation quality.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "10",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chong:2015:SID,
  author = "Stephen Chong and Christian Skalka and Jeffrey A. Vaughan",
  title = "Self-Identifying Data for Fair Use",
  journal = j-JDIQ,
  volume = "5",
  number = "3",
  pages = "11:1--11:??",
  month = feb,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2687422",
  ISSN = "1936-1955",
  bibdate = "Tue Mar 3 14:42:39 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Public-use earth science datasets are a useful resource with the unfortunate feature that their provenance is easily disconnected from their content. ``Fair-use policies'' typically associated with these datasets require appropriate attribution of providers by users, but sound and complete attribution is difficult if provenance information is lost. To address this, we introduce a technique to directly associate provenance information with sensor datasets. Our technique is similar to traditional watermarking but is intended for application to unstructured time-series datasets. Our approach is potentially imperceptible given sufficient margins of error in datasets and is robust to a number of benign but likely transformations including truncation, rounding, bit-flipping, sampling, and reordering. We provide algorithms for both one-bit and blind mark checking and show how our system can be adapted to various data representation types. Our algorithms are probabilistic in nature and are characterized by both combinatorial and empirical analyses. Mark embedding can be applied at any point in the data life cycle, allowing adaptation of our scheme to social or scientific concerns.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "11",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Baillie:2015:QPA,
  author = "Chris Baillie and Peter Edwards and Edoardo Pignotti",
  title = "{QUAL}: a Provenance-Aware Quality Model",
  journal = j-JDIQ,
  volume = "5",
  number = "3",
  pages = "12:1--12:??",
  month = feb,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2700413",
  ISSN = "1936-1955",
  bibdate = "Tue Mar 3 14:42:39 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "In this article, we present a model for quality assessment over linked data. This model has been designed to align with emerging standards for provenance on the Web to enable agents to reason about data provenance when performing quality assessment. The model also enables quality assessment provenance to be represented, thus allowing agents to make decisions about reuse of existing assessments. We also discuss the development of an OWL ontology as part of a software framework to support reasoning about data quality and assessment reuse. Finally, we evaluate this framework using two real-world case studies derived from transport and invasive-species monitoring applications.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "12",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% Volume 6, Number 1 (March 2015)

@Article{Attenberg:2015:BMC,
  author = "Joshua Attenberg and Panos Ipeirotis and Foster Provost",
  title = "Beat the Machine: Challenging Humans to Find a Predictive Model's ``Unknown Unknowns''",
  journal = j-JDIQ,
  volume = "6",
  number = "1",
  pages = "1:1--1:??",
  month = mar,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2700832",
  ISSN = "1936-1955",
  bibdate = "Thu Mar 5 07:53:50 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "We present techniques for gathering data that expose errors of automatic predictive models. In certain common settings, traditional methods for evaluating predictive models tend to miss rare but important errors --- most importantly, cases for which the model is confident of its prediction (but wrong). In this article, we present a system that, in a game-like setting, asks humans to identify cases that will cause the predictive model-based system to fail. Such techniques are valuable in discovering problematic cases that may not reveal themselves during the normal operation of the system and may include cases that are rare but catastrophic. We describe the design of the system, including design iterations that did not quite work. In particular, the system incentivizes humans to provide examples that are difficult for the model to handle by providing a reward proportional to the magnitude of the predictive model's error. The humans are asked to ``Beat the Machine'' and find cases where the automatic model (``the Machine'') is wrong. Experiments show that the humans using Beat the Machine identify more errors than do traditional techniques for discovering errors in predictive models, and, indeed, they identify many more errors where the machine is (wrongly) confident it is correct. Furthermore, those cases the humans identify seem to be not simply outliers, but coherent areas missed completely by the model. Beat the Machine identifies the ``unknown unknowns.'' Beat the Machine has been deployed at an industrial scale by several companies. The main impact has been that firms are changing their perspective on and practice of evaluating predictive models. ``There are known knowns. These are things we know that we know. There are known unknowns. That is to say, there are things that we know we don't know. But there are also unknown unknowns. There are things we don't know we don't know.'' --- Donald Rumsfeld",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "1",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Alonso:2015:CLQ,
  author = "Omar Alonso",
  title = "Challenges with Label Quality for Supervised Learning",
  journal = j-JDIQ,
  volume = "6",
  number = "1",
  pages = "2:1--2:??",
  month = mar,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2724721",
  ISSN = "1936-1955",
  bibdate = "Thu Mar 5 07:53:50 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Organizations that develop and use technologies around information retrieval, machine learning, recommender systems, and natural language processing depend on labels for engineering and experimentation. These labels, usually gathered via human computation, are used in machine-learned models for prediction and evaluation purposes. In such scenarios, collecting high-quality labels is a very important part of the overall process. We elaborate on these challenges and discuss research directions.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "2",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lukyanenko:2015:IQR,
  author = "Roman Lukyanenko and Jeffrey Parsons",
  title = "Information Quality Research Challenge: Adapting Information Quality Principles to User-Generated Content",
  journal = j-JDIQ,
  volume = "6",
  number = "1",
  pages = "3:1--3:??",
  month = mar,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2723166",
  ISSN = "1936-1955",
  bibdate = "Thu Mar 5 07:53:50 MST 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "3",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% Volume 6, Number 2--3 (July 2015)

@Article{Naumann:2015:E,
  author = "Felix Naumann",
  title = "Editorial",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "4:1--4:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2762716",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "4",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Varshney:2015:DCD,
  author = "Kush R. Varshney and Dennis Wei and Karthikeyan Natesan Ramamurthy and Aleksandra Mojsilovi{\'c}",
  title = "Data Challenges in Disease Response: The 2014 {Ebola} Outbreak and Beyond",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "5:1--5:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2742550",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "5",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Barnaghi:2015:CQD,
  author = "Payam Barnaghi and Maria Bermudez-Edo and Ralf T{\"o}njes",
  title = "Challenges for Quality of Data in Smart Cities",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "6:1--6:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2747881",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "6",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Grant:2015:CLT,
  author = "Christan Earl Grant and Daisy Zhe Wang",
  title = "A Challenge for Long-Term Knowledge Base Maintenance",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "7:1--7:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2738044",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "7",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sha:2015:DQC,
  author = "Kewei Sha and Sherali Zeadally",
  title = "Data Quality Challenges in Cyber-Physical Systems",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "8:1--8:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2740965",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "8",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gennari:2015:CQT,
  author = "Rosella Gennari and Sara Tonelli and Pierpaolo Vittorini",
  title = "Challenges in Quality of Temporal Data --- Starting with Gold Standards",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "9:1--9:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2736699",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "9",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Basole:2015:DAC,
  author = "Rahul C. Basole and Mark L. Braunstein and Jimeng Sun",
  title = "Data and Analytics Challenges for a Learning Healthcare System",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "10:1--10:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2755489",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "10",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Todoran:2015:MEI,
  author = "Ion-George Todoran and Laurent Lecornu and Ali Khenchaf and Jean-Marc {Le Caillec}",
  title = "A Methodology to Evaluate Important Dimensions of Information Quality in Systems",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "11:1--11:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2744205",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Assessing the quality of the information proposed by an information system has become one of the major research topics in the last two decades. A quick literature survey shows that a significant number of information quality frameworks are proposed in different domains of application: management information systems, web information systems, information fusion systems, and so forth. Unfortunately, they do not provide a feasible methodology that is both simple and intuitive to be implemented in practice. In order to address this need, we present in this article a new information quality methodology. Our methodology makes use of existing frameworks and proposes a three-step process capable of tracking the quality changes through the system. In the first step and as a novelty compared to existing studies, we propose decomposing the information system into its elementary modules. Having access to each module allows us to locally define the information quality. Then, in the second step, we model each processing module by a quality transfer function, capturing the module's influence over the information quality. In the third step, we make use of the previous two steps in order to estimate the quality of the entire information system. Thus, our methodology allows informing the end-user on both output quality and local quality. The proof of concept of our methodology has been carried out considering two applications: an automatic target recognition system and a diagnosis coding support system.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "11",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zarraga-Rodriguez:2015:EID,
  author = "Marta Zarraga-Rodriguez and M. Jesus Alvarez",
  title = "Experience: Information Dimensions Affecting Employees' Perceptions Towards Being Well Informed",
  journal = j-JDIQ,
  volume = "6",
  number = "2--3",
  pages = "12:1--12:??",
  month = jul,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2774223",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Information is a strategic company resource, but there is no consensus in the literature regarding the set of dimensions to be considered when measuring the quality of the information. Most measures of information quality depend on user perception. Using multiple correlation analysis, we obtain a model that allows us to explain how information quality dimensions influence information consumers' overall feeling of being well informed. A set of dimensions that any measure of information quality should at least include is proposed. This exploratory study reports the results of a research survey among managers of companies committed to quality management within the framework of a Total Quality Management (TQM) model, which is an information-intensive management model.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "12",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% Volume 6, Number 4 (October 2015)

@Article{Bartoli:2015:DQC,
  author = "Alberto Bartoli and Andrea {De Lorenzo} and Eric Medvet and Fabiano Tarlao",
  title = "Data Quality Challenge: Toward a Tool for String Processing by Examples",
  journal = j-JDIQ,
  volume = "6",
  number = "4",
  pages = "13:1--13:??",
  month = oct,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2786983",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "13",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ahlers:2015:DCQ,
  author = "Dirk Ahlers and John Krogstie",
  title = "Document and Corpus Quality Challenges for Knowledge Management in Engineering Enterprises",
  journal = j-JDIQ,
  volume = "6",
  number = "4",
  pages = "14:1--14:??",
  month = oct,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2818379",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "14",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ramadan:2015:DSN,
  author = "Banda Ramadan and Peter Christen and Huizhi Liang and Ross W. Gayler",
  title = "Dynamic Sorted Neighborhood Indexing for Real-Time Entity Resolution",
  journal = j-JDIQ,
  volume = "6",
  number = "4",
  pages = "15:1--15:??",
  month = oct,
  year = "2015",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2816821",
  ISSN = "1936-1955",
  bibdate = "Tue Oct 27 22:10:29 MDT 2015",
  bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract = "Real-time Entity Resolution (ER) is the process of matching query records in subsecond time with records in a database that represent the same real-world entity. Indexing techniques are generally used to efficiently extract a set of candidate records from the database that are similar to a query record, and that are to be compared with the query record in more detail. The sorted neighborhood indexing method, which sorts a database and compares records within a sliding window, has been successfully used for ER of large static databases. However, because it is based on static sorted arrays and is designed for batch ER that resolves all records in a database rather than resolving those relating to a single query record, this technique is not suitable for real-time ER on dynamic databases that are constantly updated. We propose a tree-based technique that facilitates dynamic indexing based on the sorted neighborhood method, which can be used for real-time ER, and investigate both static and adaptive window approaches. We propose an approach to reduce query matching times by precalculating the similarities between attribute values stored in neighboring tree nodes. We also propose a multitree solution where different sorting keys are used to reduce the effects of errors and variations in attribute values on matching quality by building several distinct index trees. We experimentally evaluate our proposed techniques on large real datasets, as well as on synthetic data with different data quality characteristics. Our results show that as the index grows, no appreciable increase occurs in both record insertion and query times, and that using multiple trees gives noticeable improvements on matching quality with only a small increase in query time. Compared to earlier indexing techniques for real-time ER, our approach achieves significantly reduced indexing and query matching times while maintaining high matching accuracy.",
  acknowledgement = ack-nhfb,
  ajournal = "J. Data Inf. Qual.",
  articleno = "15",
  fjournal = "Journal of Data and Information Quality (JDIQ)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Coletti:2015:DCH, author = "Paolo Coletti and Maurizio Murgia", title = "Design and Construction of a Historical Financial Database of the {Italian} Stock Market 1973--2011", journal = j-JDIQ, volume = "6", number = "4", pages = "16:1--16:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2822898", ISSN = "1936-1955", bibdate = "Tue Oct 27 22:10:29 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This article presents the technical aspects of designing and building a historical database of the Italian Stock Market. The database contains daily market data from 1973 to 2011 and is constructed by merging two main digital sources and several other hand-collected data sources. We analyzed and developed semiautomatic tools to deal with problems related to time-series matchings, quality of data, and numerical errors.
We also developed a concatenation structure to allow the handling of company name changes, mergers, and spin-offs without artificially altering numerical series. At the same time, we maintained the transparency of the historical information on each individual company listed. Thanks to the overlapping of digital and hand-collected data, the completed database has a very high level of detail and accuracy. The dataset is particularly suited for any empirical research in financial economics and for more practically oriented numerical applications and forecasting simulations.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Missier:2015:CSI, author = "Paolo Missier", title = "Corrigendum to the Special Issue Editorial in {JDIQ} Volume 5, Issue 3", journal = j-JDIQ, volume = "6", number = "4", pages = "17:1--17:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2821019", ISSN = "1936-1955", bibdate = "Tue Oct 27 22:10:29 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Chapman:2016:CQD, author = "Adriane P. Chapman and Arnon Rosenthal and Len Seligman", title = "The Challenge of ``Quick and Dirty'' Information Quality", journal = j-JDIQ, volume = "7", number = "1--2", pages = "1:1--1:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2834123", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Millar:2016:DQC, author = "Jeremy R. Millar and Douglas D. Hodson and Gilbert L. Peterson and Darryl K. Ahner", title = "Data Quality Challenges in Distributed Live-Virtual-Constructive Test Environments", journal = j-JDIQ, volume = "7", number = "1--2", pages = "2:1--2:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2850420", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Lukyanenko:2016:IQR, author = "Roman Lukyanenko", title = "Information Quality Research Challenge: Information Quality in the Age of Ubiquitous Digital Intermediation", journal = j-JDIQ, volume = "7", number = "1--2", pages = "3:1--3:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2856038", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "As information technology becomes an integral part of daily life, increasingly, people understand the world around them by turning to digital sources as opposed to directly interacting with objects in the physical world. This has ushered in the age of Ubiquitous Digital Intermediation (UDI). With the explosion of UDI, the scope of Information Quality (IQ) research is due to expand dramatically as the challenge becomes to capture the wealth and nuances of human experience. 
This article presents three key changes to the IQ landscape brought about by UDI, including expansion of the scope of traditional IQ dimensions, digital to physical mapping challenge, and the increased need to manage content authenticity. UDI generates many novel questions and opportunities for the IQ research community.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Zhu:2016:DSC, author = "Hongwei Zhu and Yang W. Lee and Arnon S. Rosenthal", title = "Data Standards Challenges for Interoperable and Quality Data", journal = j-JDIQ, volume = "7", number = "1--2", pages = "4:1--4:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2903723", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ulbricht:2016:CCD, author = "Robert Ulbricht and Hilko Donker and Claudio Hartmann and Martin Hahmann and Wolfgang Lehner", title = "Challenges for Context-Driven Time Series Forecasting", journal = j-JDIQ, volume = "7", number = "1--2", pages = "5:1--5:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2896822", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Predicting time series is a crucial task for organizations, since decisions are often based on uncertain information. Many forecasting models are designed from a generic statistical point of view. 
However, each real-world application requires domain-specific adaptations to obtain high-quality results. All such specifics are summarized by the term of context. In contrast to current approaches, we want to integrate context as the primary driver in the forecasting process. We introduce context-driven time series forecasting focusing on two exemplary domains: renewable energy and sparse sales data. In view of this, we discuss the challenge of context integration in the individual process steps.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ceolin:2016:CUR, author = "Davide Ceolin and Paul Groth and Valentina Maccatrozzo and Wan Fokkink and Willem Robert {Van Hage} and Archana Nottamkandath", title = "Combining User Reputation and Provenance Analysis for Trust Assessment", journal = j-JDIQ, volume = "7", number = "1--2", pages = "6:1--6:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818382", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Trust is a broad concept that in many systems is often reduced to user reputation alone. However, user reputation is just one way to determine trust. The estimation of trust can be tackled from other perspectives as well, including by looking at provenance. Here, we present a complete pipeline for estimating the trustworthiness of artifacts given their provenance and a set of sample evaluations. 
The pipeline is composed of a series of algorithms for (1) extracting relevant provenance features, (2) generating stereotypes of user behavior from provenance features, (3) estimating the reputation of both stereotypes and users, (4) using a combination of user and stereotype reputations to estimate the trustworthiness of artifacts, and (5) selecting sets of artifacts to trust. These algorithms rely on the W3C PROV recommendations for provenance and on evidential reasoning by means of subjective logic. We evaluate the pipeline over two tagging datasets: tags and evaluations from the Netherlands Institute for Sound and Vision's Waisda? video tagging platform, as well as crowdsourced annotations from the Steve.Museum project. The approach achieves up to 85\% precision when predicting tag trustworthiness. Perhaps more importantly, the pipeline provides satisfactory results using relatively little evidence through the use of provenance.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Christen:2016:ADA, author = "Peter Christen and Ross W. Gayler and Khoi-Nguyen Tran and Jeffrey Fisher and Dinusha Vatsalan", title = "Automatic Discovery of Abnormal Values in Large Textual Databases", journal = j-JDIQ, volume = "7", number = "1--2", pages = "7:1--7:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2889311", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Textual databases are ubiquitous in many application domains. Examples of textual data range from names and addresses of customers to social media posts and bibliographic records. 
With online services, individuals are increasingly required to enter their personal details for example when purchasing products online or registering for government services, while many social network and e-commerce sites allow users to post short comments. Many online sites leave open the possibility for people to enter unintended or malicious abnormal values, such as names with errors, bogus values, profane comments, or random character sequences. In other applications, such as online bibliographic databases or comparative online shopping sites, databases are increasingly populated in (semi-) automatic ways through Web crawls. This practice can result in low quality data being added automatically into a database. In this article, we develop three techniques to automatically discover abnormal (unexpected or unusual) values in large textual databases. Following recent work in categorical outlier detection, our assumption is that ``normal'' values are those that occur frequently in a database, while an individual abnormal value is rare. Our techniques are unsupervised and address the challenge of discovering abnormal values as an outlier detection problem. Our first technique is a basic but efficient q-gram set based technique, the second is based on a probabilistic language model, and the third employs morphological word features to train a one-class support vector machine classifier. Our aim is to investigate and develop techniques that are fast, efficient, and automatic. The output of our techniques can help in the development of rule-based data cleaning and information extraction systems, or be used as training data for further supervised data cleaning procedures. We evaluate our techniques on four large real-world datasets from different domains: two US voter registration databases containing personal details, the 2013 KDD Cup dataset of bibliographic records, and the SNAP Memetracker dataset of phrases from social networking sites. 
Our results show that our techniques can efficiently and automatically discover abnormal textual values, allowing an organization to conduct efficient data exploration, and improve the quality of their textual databases without the need of requiring explicit training data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Aiken:2016:ESD, author = "Peter Aiken", title = "{EXPERIENCE}: Succeeding at Data Management---{BigCo} Attempts to Leverage Data", journal = j-JDIQ, volume = "7", number = "1--2", pages = "8:1--8:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2893482", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In a manner similar to most organizations, BigCompany (BigCo) was determined to benefit strategically from its widely recognized and vast quantities of data. (U.S. government agencies make regular visits to BigCo to learn from its experiences in this area.) When faced with an explosion in data volume, increases in complexity, and a need to respond to changing conditions, BigCo struggled to respond using a traditional, information technology (IT) project-based approach to address these challenges. As BigCo was not data knowledgeable, it did not realize that traditional approaches could not work. Two full years into the initiative, BigCo was far from achieving its initial goals. How much more time, money, and effort would be required before results were achieved? Moreover, could the results be achieved in time to support a larger, critical, technology-driven challenge that also depended on solving the data challenges?
While these questions remain unaddressed, these considerations increase our collective understanding of data assets as separate from IT projects. Only by reconceiving data as a strategic asset can organizations begin to address these new challenges. Transformation to a data-driven culture requires far more than technology, which remains just one of three required ``stool legs'' (people and process being the other two). Seven prerequisites to effectively leveraging data are necessary, but insufficient awareness exists in most organizations---hence, the widespread misfires in these areas, especially when attempting to implement the so-called big data initiatives. Refocusing on foundational data management practices is required for all organizations, regardless of their organizational or data strategies.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Chiang:2016:UDC, author = "Fei Chiang and Siddharth Sitaramachandran", title = "Unifying Data and Constraint Repairs", journal = j-JDIQ, volume = "7", number = "3", pages = "9:1--9:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2883616", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Integrity constraints play an important role in data design. However, in an operational database, they may not be enforced for many reasons. Hence, over time, data may become inconsistent with respect to the constraints. To manage this, several approaches have proposed techniques to repair the data by finding minimal or lowest cost changes to the data that make it consistent with the constraints.
Such techniques are appropriate for applications where only the data changes, but schemas and their constraints remain fixed. In many modern applications, however, constraints may evolve over time as application or business rules change, as data are integrated with new data sources or as the underlying semantics of the data evolves. In such settings, when an inconsistency occurs, it is no longer clear if there is an error in the data (and the data should be repaired) or if the constraints have evolved (and the constraints should be repaired). In this work, we present a novel unified cost model that allows data and constraint repairs to be compared on an equal footing. We consider repairs over a database that is inconsistent with respect to a set of rules, modeled as functional dependencies (FDs). FDs are the most common type of constraint and are known to play an important role in maintaining data quality. We propose modifications to the data and to the FDs such that the data and the constraints are better aligned. We evaluate the quality and scalability of our repair algorithms over synthetic and real datasets. The results show that our repair algorithms not only scale well for large datasets but also are able to accurately capture and correct inconsistencies and accurately decide when a data repair versus a constraint repair is best.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Maltese:2016:SAC, author = "Vincenzo Maltese and Fausto Giunchiglia", title = "Search and Analytics Challenges in Digital Libraries and Archives", journal = j-JDIQ, volume = "7", number = "3", pages = "10:1--10:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2939377", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Gelernter:2016:COE, author = "J. Gelernter and J. Jha", title = "Challenges in Ontology Evaluation", journal = j-JDIQ, volume = "7", number = "3", pages = "11:1--11:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2935751", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Berti-Equille:2016:VBD, author = "Laure Berti-Equille and Mouhamadou Lamine Ba", title = "Veracity of Big Data: Challenges of Cross-Modal Truth Discovery", journal = j-JDIQ, volume = "7", number = "3", pages = "12:1--12:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2935753", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Haralabopoulos:2016:CIC, author = "Giannis Haralabopoulos and Ioannis Anagnostopoulos and Sherali Zeadally", title = "The Challenge of Improving Credibility of User-Generated Content in Online Social Networks", journal = j-JDIQ, volume = "7", number = "3", pages = "13:1--13:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2899003", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In every environment of information exchange, Information Quality (IQ) is considered one of the most important issues. Studies in Online Social Networks (OSNs) analyze a number of related subjects that span both theoretical and practical aspects, from data quality identification and simple attribute classification to quality assessment models for various social environments. Among several factors that affect information quality in online social networks is the credibility of user-generated content. 
To address this challenge, some proposed solutions include community-based evaluation and labeling of user-generated content in terms of accuracy, clarity, and timeliness, along with well-established real-time data mining techniques.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{DUrso:2016:EGD, author = "Ciro D'Urso", title = "{EXPERIENCE}: Glitches in Databases, How to Ensure Data Quality by Outlier Detection Techniques", journal = j-JDIQ, volume = "7", number = "3", pages = "14:1--14:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2950109", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:26 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Enterprise's archives are inevitably affected by the presence of data quality problems (also called glitches). This article proposes the application of a new method to analyze the quality of datasets stored in the tables of a database, with no knowledge of the semantics of the data and without the need to define repositories of rules. The proposed method is based on proper revisions of different approaches for outlier detection that are combined to boost overall performance and accuracy. A novel transformation algorithm is conceived that treats the items in database tables as data points in real coordinate space of n dimensions, so that fields containing dates and fields containing text are processed to calculate distances between those data points. The implementation of an iterative approach ensures that global and local outliers are discovered even if they are subject, primarily in datasets with multiple outliers or clusters of outliers, to masking and swamping effects. 
The application of the method to a set of archives, some of which have been studied extensively in the literature, provides very promising experimental results and outperforms the application of a single other technique. Finally, a list of future research directions is highlighted.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Labouseur:2017:IDD, author = "Alan G. Labouseur and Carolyn C. Matheus", title = "An Introduction to Dynamic Data Quality Challenges", journal = j-JDIQ, volume = "8", number = "2", pages = "6:1--6:??", month = feb, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2998575", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:27 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Becker:2017:CTD, author = "Christoph Becker and Kresimir Duretec and Andreas Rauber", title = "The Challenge of Test Data Quality in Data Processing", journal = j-JDIQ, volume = "8", number = "2", pages = "7:1--7:??", month = feb, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3012004", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:27 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ferro:2017:RCI, author = "Nicola Ferro", title = "Reproducibility Challenges in Information Retrieval Evaluation", journal = j-JDIQ, volume = "8", number = "2", pages = "8:1--8:??", month = feb, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3020206", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:27 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Shankaranarayanan:2017:CCE, author = "G. Shankaranarayanan and Roger Blake", title = "From Content to Context: The Evolution and Growth of Data Quality Research", journal = j-JDIQ, volume = "8", number = "2", pages = "9:1--9:??", month = feb, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/2996198", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:27 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Research in data and information quality has made significant strides over the last 20 years. It has become a unified body of knowledge incorporating techniques, methods, and applications from a variety of disciplines including information systems, computer science, operations management, organizational behavior, psychology, and statistics. With organizations viewing ``Big Data'', social media data, data-driven decision-making, and analytics as critical, data quality has never been more important. 
We believe that data quality research is reaching the threshold of significant growth and a metamorphosis from focusing on measuring and assessing data quality---content---toward a focus on usage and context. At this stage, it is vital to understand the identity of this research area in order to recognize its current state and to effectively identify an increasing number of research opportunities within. Using Latent Semantic Analysis (LSA) to analyze the abstracts of 972 peer-reviewed journal and conference articles published over the past 20 years, this article contributes by identifying the core topics and themes that define the identity of data quality research. It further explores their trends over time, pointing to the data quality dimensions that have---and have not---been well-studied, and offering insights into topics that may provide significant opportunities in this area.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Goldberg:2017:PIS, author = "Sean Goldberg and Daisy Zhe Wang and Christan Grant", title = "A Probabilistically Integrated System for Crowd-Assisted Text Labeling and Extraction", journal = j-JDIQ, volume = "8", number = "2", pages = "10:1--10:??", month = feb, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3012003", ISSN = "1936-1955", bibdate = "Sat Apr 8 09:38:27 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The amount of text data has been growing exponentially in recent years, giving rise to automatic information extraction methods that store text annotations in a database. The current state-of-the-art structured prediction methods, however, are likely to contain errors and it is important to be able to manage the overall uncertainty of the database.
On the other hand, the advent of crowdsourcing has enabled humans to aid machine algorithms at scale. In this article, we introduce pi-CASTLE, a system that optimizes and integrates human and machine computing as applied to a complex structured prediction problem involving Conditional Random Fields (CRFs). We propose strategies grounded in information theory to select a token subset, formulate questions for the crowd to label, and integrate these labelings back into the database using a method of constrained inference. On both a text segmentation task over academic citations and a named entity recognition task over tweets we show an order of magnitude improvement in accuracy gain over baseline methods.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Woodall:2017:DRC, author = "Philip Woodall", title = "The Data Repurposing Challenge: New Pressures from Data Analytics", journal = j-JDIQ, volume = "8", number = "3--4", pages = "11:1--11:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3022698", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Markovic:2017:CQS, author = "Milan Markovic and Peter Edwards", title = "The Challenge of Quality in Social Computation", journal = j-JDIQ, volume = "8", number = "3--4", pages = "12:1--12:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3041762", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Al-Hussaini:2017:EIB, author = "Leena Al-Hussaini", title = "Experience: Insights into the Benchmarking Data of {Hunspell} and {Aspell} Spell Checkers", journal = j-JDIQ, volume = "8", number = "3--4", pages = "13:1--13:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3092700", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib; https://www.math.utah.edu/pub/tex/bib/spell.bib", abstract = "Hunspell is a morphological spell checker and automatic corrector for Macintosh 10.6 and later versions. Aspell is a general spell checker and automatic corrector for the GNU operating system. In this experience article, we present a benchmarking study of the performance of Hunspell and Aspell. Ginger is a general grammatical spell checker that is used as a baseline to compare the performance of Hunspell and Aspell. A benchmark dataset was carefully selected to be a mixture of different error types at different word length levels. 
Further, the benchmarking data are from very bad spellers and will challenge any spell checker. The extensive study described in this work will characterize the respective softwares and benchmarking data from multiple perspectives and will consider many error statistics. Overall, Hunspell can correct 415/469 words and Aspell can correct 414/469 words. The baseline Ginger can correct 279/469 words. We recommend this dataset as the preferred benchmark dataset for evaluating newly developed ``isolated word'' spell checkers.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Abdellaoui:2017:QSD, author = "Sabrina Abdellaoui and Fahima Nader and Rachid Chalal", title = "{QDflows}: a System Driven by Knowledge Bases for Designing Quality-Aware Data flows", journal = j-JDIQ, volume = "8", number = "3--4", pages = "14:1--14:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3064173", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In the big data era, data integration is becoming increasingly important. It is usually handled by data flows processes that extract, transform, and clean data from several sources, and populate the data integration system (DIS). Designing data flows is facing several challenges. In this article, we deal with data quality issues such as (1) specifying a set of quality rules, (2) enforcing them on the data flow pipeline to detect violations, and (3) producing accurate repairs for the detected violations. 
We propose QDflows, a system for designing quality-aware data flows that considers the following as input: (1) a high-quality knowledge base (KB) as the global schema of integration, (2) a set of data sources and a set of validated users' requirements, (3) a set of defined mappings between data sources and the KB, and (4) a set of quality rules specified by users. QDflows uses an ontology to design the DIS schema. It offers the ability to define the DIS ontology as a module of the knowledge base, based on validated users' requirements. The DIS ontology model is then extended with multiple types of quality rules specified by users. QDflows extracts and transforms data from sources to populate the DIS. It detects violations of quality rules enforced on the data flows, constructs repair patterns, searches for horizontal and vertical matches in the knowledge base, and performs an automatic repair when possible or generates possible repairs. It interactively involves users to validate the repair process before loading the clean data into the DIS. Using real-life and synthetic datasets, the DBpedia and Yago knowledge bases, we experimentally evaluate the generality, effectiveness, and efficiency of QDflows. We also showcase an interactive tool implementing our system.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{St-Maurice:2017:ECS, author = "Justin St-Maurice and Catherine Burns", title = "An Exploratory Case Study to Understand Primary Care Users and Their Data Quality Tradeoffs", journal = j-JDIQ, volume = "8", number = "3--4", pages = "15:1--15:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3058750", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Primary care data is an important part of the evolving healthcare ecosystem. Generally, users in primary care are expected to provide excellent patient care and record high-quality data. In practice, users must balance sets of priorities regarding care and data. The goal of this study was to understand data quality tradeoffs between timeliness, validity, completeness, and use among primary care users. As a case study, data quality measures and metrics are developed through a focus group session with managers. After calculating and extracting measurements of data quality from six years of historic data, each measure was modeled with logit binomial regression to show correlations, characterize tradeoffs, and investigate data quality interactions. Measures and correlations for completeness, use, and timeliness were calculated for 196,967 patient encounters. Based on the analysis, there was a positive relationship between validity and completeness, and a negative relationship between timeliness and use. Use of data and reductions in entry delay were positively associated with completeness and validity. Our results suggest that if users are not provided with sufficient time to record data as part of their regular workflow, they will prioritize spending available time with patients. 
As a measurement of a primary care system's effectiveness, the negative correlation between use and timeliness points to a self-reinforcing relationship that provides users with little external value. In the future, additional data can be generated from comparable organizations to test several new hypotheses about primary care users.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Wang:2017:DDR, author = "Jiannan Wang and Nan Tang", title = "Dependable Data Repairing with Fixing Rules", journal = j-JDIQ, volume = "8", number = "3--4", pages = "16:1--16:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3041761", ISSN = "1936-1955", bibdate = "Mon Oct 2 09:44:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/jdqi/; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "One of the main challenges that data-cleaning systems face is to automatically identify and repair data errors in a dependable manner. Though data dependencies (also known as integrity constraints) have been widely studied to capture errors in data, automated and dependable data repairing on these errors has remained a notoriously difficult problem. In this work, we introduce an automated approach for dependably repairing data errors, based on a novel class of fixing rules. A fixing rule contains an evidence pattern, a set of negative patterns, and a fact value. The heart of fixing rules is deterministic: given a tuple, the evidence pattern and the negative patterns of a fixing rule are combined to precisely capture which attribute is wrong, and the fact indicates how to correct this error. We study several fundamental problems associated with fixing rules and establish their complexity. 
We develop efficient algorithms to check whether a set of fixing rules are consistent and discuss approaches to resolve inconsistent fixing rules. We also devise efficient algorithms for repairing data errors using fixing rules. Moreover, we discuss approaches on how to generate a large number of fixing rules from examples or available knowledge bases. We experimentally demonstrate that our techniques outperform other automated algorithms in terms of the accuracy of repairing data errors, using both real-life and synthetic data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Marcheggiani:2017:ELQ, author = "Diego Marcheggiani and Fabrizio Sebastiani", title = "On the Effects of Low-Quality Training Data on Information Extraction from Clinical Reports", journal = j-JDIQ, volume = "9", number = "1", pages = "1:1--1:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106235", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In the last five years there has been a flurry of work on information extraction from clinical documents, that is, on algorithms capable of extracting, from the informal and unstructured texts that are generated during everyday clinical practice, mentions of concepts relevant to such practice. Many of these research works are about methods based on supervised learning, that is, methods for training an information extraction system from manually annotated examples. While a lot of work has been devoted to devising learning methods that generate more and more accurate information extractors, no work has been devoted to investigating the effect of the quality of training data on the learning process for the clinical domain. 
Low quality in training data often derives from the fact that the person who has annotated the data is different from the one against whose judgment the automatically annotated data must be evaluated. In this article, we test the impact of such data quality issues on the accuracy of information extraction systems as applied to the clinical domain. We do this by comparing the accuracy deriving from training data annotated by the authoritative coder (i.e., the one who has also annotated the test data and by whose judgment we must abide) with the accuracy deriving from training data annotated by a different coder, equally expert in the subject matter. The results indicate that, although the disagreement between the two coders (as measured on the training set) is substantial, the difference is (surprisingly enough) not always statistically significant. While the dataset used in the present work originated in a clinical context, the issues we study in this work are of more general interest.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Basheer:2017:CBQ, author = "Aseel Basheer and Kewei Sha", title = "Cluster-Based Quality-Aware Adaptive Data Compression for Streaming Data", journal = j-JDIQ, volume = "9", number = "1", pages = "2:1--2:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3122863", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Wireless sensor networks (WSNs) are widely applied in data collection applications. Energy efficiency is one of the most important design goals of WSNs. In this article, we examine the tradeoffs between the energy efficiency and the data quality. 
First, four attributes used to evaluate data quality are formally defined. Then, we propose a novel data compression algorithm, Quality-Aware Adaptive data Compression (QAAC), to reduce the amount of data communication to save energy. QAAC utilizes an adaptive clustering algorithm to build clusters from dataset; then a code for each cluster is generated and stored in a Huffman encoding tree. The encoding algorithm encodes the original dataset based on the Huffman encoding tree. An improvement algorithm is also designed to reduce the information loss when data are compressed. After the encoded data, the Huffman encoding tree and parameters used in the improvement algorithm have been received at the sink, a decompression algorithm is used to retrieve the approximation of the original dataset. The performance evaluation shows that QAAC is efficient and achieves a much higher compression ratio than lossy and lossless compression algorithms, while it has much smaller information loss than lossy compression algorithms.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Corsar:2017:COD, author = "David Corsar and Peter Edwards", title = "Challenges of Open Data Quality: More Than Just License, Format, and Customer Support", journal = j-JDIQ, volume = "9", number = "1", pages = "3:1--3:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3110291", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{El-Mawass:2017:DQC, author = "Nour El-Mawass and Saad Alaboodi", title = "Data Quality Challenges in Social Spam Research", journal = j-JDIQ, volume = "9", number = "1", pages = "4:1--4:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3090057", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Chen:2017:IQC, author = "Min Chen and Roman Lukyanenko and Monica Chiarini Tremblay", title = "Information Quality Challenges in Shared Healthcare Decision Making", journal = j-JDIQ, volume = "9", number = "1", pages = "5:1--5:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3090056", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Arbuckle:2017:CPC, author = "Peter Arbuckle and Ezra Kahn and Adam Kriesberg", title = "Challenge Paper: Challenges to Sharing Data and Models for Life Cycle Assessment", journal = j-JDIQ, volume = "9", number = "1", pages = "6:1--6:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106236", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:56 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Raschid:2018:ECJ, author = "Louiqa Raschid", title = "{Editor-in-Chief (January 2014--May 2017)} Farewell Report", journal = j-JDIQ, volume = "9", number = "2", pages = "7:1--7:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3143313", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Catarci:2018:FNJ, author = "Tiziana Catarci", title = "Foreword from the New {JDIQ Editor-in-Chief}", journal = j-JDIQ, volume = "9", number = "2", pages = "8:1--8:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3143316", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Truong:2018:CEQ, author = "Hong-Linh Truong and Aitor Murguzur and Erica Yang", title = "Challenges in Enabling Quality of Analytics in the Cloud", journal = j-JDIQ, volume = "9", number = "2", pages = "9:1--9:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3138806", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Koh:2018:ELA, author = "Kyu Han Koh and Eric Fouh and Mohammed F. Farghally and Hossameldin Shahin and Clifford A. Shaffer", title = "Experience: Learner Analytics Data Quality for an {eTextbook} System", journal = j-JDIQ, volume = "9", number = "2", pages = "10:1--10:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3148240", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "We present lessons learned related to data collection and analysis from 5 years of experience with the eTextbook system OpenDSA. The use of such cyberlearning systems is expanding rapidly in both formal and informal educational settings. Although the precise issues related to any such project are idiosyncratic based on the data collection technology and goals of the project, certain types of data collection problems will be common. We begin by describing the nature of the data transmitted between the student's client machine and the database server, and our initial database schema for storing interaction log data. We describe many problems that we encountered, with the nature of the problems categorized as syntactic-level data collection issues, issues with relating events to users, or issues with tracking users over time. Relating events to users and tracking the time spent on tasks are both prerequisites to converting syntactic-level interaction streams to semantic-level behavior needed for higher-order analysis of the data. Finally, we describe changes made to our database schema that helped to resolve many of the issues that we had encountered. 
These changes help advance our ultimate goal of encouraging a change from ineffective learning behavior by students to more productive behavior.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Cappiello:2018:VDQ, author = "C. Cappiello and C. Cerletti and C. Fratto and B. Pernici", title = "Validating Data Quality Actions in Scoring Processes", journal = j-JDIQ, volume = "9", number = "2", pages = "11:1--11:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3141248", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Data quality has gained momentum among organizations upon the realization that poor data quality might cause failures and/or inefficiencies, thus compromising business processes and application results. However, enterprises often adopt data quality assessment and improvement methods based on practical and empirical approaches without conducting a rigorous analysis of the data quality issues and outcome of the enacted data quality improvement practices. In particular, data quality management, especially the identification of the data quality dimensions to be monitored and improved, is performed by knowledge workers on the basis of their skills and experience. Control methods are therefore designed on the basis of expected and evident quality problems; thus, these methods may not be effective in dealing with unknown and/or unexpected problems. This article aims to provide a methodology, based on fault injection, for validating the data quality actions used by organizations. We show how it is possible to check whether the adopted techniques properly monitor the real issues that may damage business processes. 
At this stage, we focus on scoring processes, i.e., those in which the output represents the evaluation or ranking of a specific object. We show the effectiveness of our proposal by means of a case study in the financial risk management area.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Heinrich:2018:RDQ, author = "Bernd Heinrich and Diana Hristova and Mathias Klier and Alexander Schiller and Michael Szubartowicz", title = "Requirements for Data Quality Metrics", journal = j-JDIQ, volume = "9", number = "2", pages = "12:1--12:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3148238", ISSN = "1936-1955", bibdate = "Mon Jan 22 16:07:57 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Data quality and especially the assessment of data quality have been intensively discussed in research and practice alike. To support an economically oriented management of data quality and decision making under uncertainty, it is essential to assess the data quality level by means of well-founded metrics. However, if not adequately defined, these metrics can lead to wrong decisions and economic losses. Therefore, based on a decision-oriented framework, we present a set of five requirements for data quality metrics. These requirements are relevant for a metric that aims to support an economically oriented management of data quality and decision making under uncertainty. We further demonstrate the applicability and efficacy of these requirements by evaluating five data quality metrics for different data quality dimensions. Moreover, we discuss practical implications when applying the presented requirements.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Geerts:2018:ESI, author = "Floris Geerts and Paolo Missier and Norman Paton", title = "Editorial: Special Issue on Improving the Veracity and Value of Big Data", journal = j-JDIQ, volume = "9", number = "3", pages = "13:1--13:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3174791", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bertossi:2018:OMD, author = "Leopoldo Bertossi and Mostafa Milani", title = "Ontological Multidimensional Data Models and Contextual Data Quality", journal = j-JDIQ, volume = "9", number = "3", pages = "14:1--14:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3148239", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Data quality assessment and data cleaning are context-dependent activities. Motivated by this observation, we propose the Ontological Multidimensional Data Model (OMD model), which can be used to model and represent contexts as logic-based ontologies. The data under assessment are mapped into the context for additional analysis, processing, and quality data extraction. The resulting contexts allow for the representation of dimensions, and multidimensional data quality assessment becomes possible. At the core of a multidimensional context, we include a generalized multidimensional data model and a Datalog$^\pm $ ontology with provably good properties in terms of query answering. 
These main components are used to represent dimension hierarchies, dimensional constraints, and dimensional rules and define predicates for quality data specification. Query answering relies on and triggers navigation through dimension hierarchies and becomes the basic tool for the extraction of quality data. The OMD model is interesting per se beyond applications to data quality. It allows for a logic-based and computationally tractable representation of multidimensional data, extending previous multidimensional data models with additional expressive power and functionalities.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Mountantonakis:2018:SMM, author = "Michalis Mountantonakis and Yannis Tzitzikas", title = "Scalable Methods for Measuring the Connectivity and Quality of Large Numbers of Linked Datasets", journal = j-JDIQ, volume = "9", number = "3", pages = "15:1--15:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3165713", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Although the ultimate objective of Linked Data is linking and integration, it is not currently evident how connected the current Linked Open Data (LOD) cloud is. 
In this article, we focus on methods, supported by special indexes and algorithms, for performing measurements related to the connectivity of more than two datasets that are useful in various tasks including (a) Dataset Discovery and Selection; (b) Object Coreference, i.e., for obtaining complete information about a set of entities, including provenance information; (c) Data Quality Assessment and Improvement, i.e., for assessing the connectivity between any set of datasets and monitoring their evolution over time, as well as for estimating data veracity; (d) Dataset Visualizations; and various other tasks. Since it would be prohibitively expensive to perform all these measurements in a na{\"\i}ve way, in this article, we introduce indexes (and their construction algorithms) that can speed up such tasks. In brief, we introduce (i) a namespace-based prefix index, (ii) a sameAs catalog for computing the symmetric and transitive closure of the owl:sameAs relationships encountered in the datasets, (iii) a semantics-aware element index (that exploits the aforementioned indexes), and, finally, (iv) two lattice-based incremental algorithms for speeding up the computation of the intersection of URIs of any set of datasets. For enhancing scalability, we propose parallel index construction algorithms and parallel lattice-based incremental algorithms, we evaluate the achieved speedup using either a single machine or a cluster of machines, and we provide insights regarding the factors that affect efficiency. Finally, we report measurements about the connectivity of the (billion triples-sized) LOD cloud that have never been carried out so far.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Esteves:2018:TVA, author = "Diego Esteves and Anisa Rula and Aniketh Janardhan Reddy and Jens Lehmann", title = "Toward Veracity Assessment in {RDF} Knowledge Bases: an Exploratory Analysis", journal = j-JDIQ, volume = "9", number = "3", pages = "16:1--16:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177873", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Among different characteristics of knowledge bases, data quality is one of the most relevant to maximize the benefits of the provided information. Knowledge base quality assessment poses a number of big data challenges such as high volume, variety, velocity, and veracity. In this article, we focus on answering questions related to the assessment of the veracity of facts through Deep Fact Validation (DeFacto), a triple validation framework designed to assess facts in RDF knowledge bases. Despite current developments in the research area, the underlying framework faces many challenges. This article pinpoints and discusses these issues and conducts a thorough analysis of its pipeline, aiming at reducing the error propagation through its components. Furthermore, we discuss recent developments related to this fact validation as well as describing advantages and drawbacks of state-of-the-art models. As a result of this exploratory analysis, we give insights and directions toward a better architecture to tackle the complex task of fact-checking in knowledge bases.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Chen:2018:CAS, author = "Qingyu Chen and Yu Wan and Xiuzhen Zhang and Yang Lei and Justin Zobel and Karin Verspoor", title = "Comparative Analysis of Sequence Clustering Methods for Deduplication of Biological Databases", journal = j-JDIQ, volume = "9", number = "3", pages = "17:1--17:??", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3131611", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The massive volumes of data in biological sequence databases provide a remarkable resource for large-scale biological studies. However, the underlying data quality of these resources is a critical concern. A particular challenge is duplication, in which multiple records have similar sequences, creating a high level of redundancy that impacts database storage, curation, and search. Biological database deduplication has two direct applications: for database curation, where detected duplicates are removed to improve curation efficiency, and for database search, where detected duplicate sequences may be flagged but remain available to support analysis. Clustering methods have been widely applied to biological sequences for database deduplication. Since an exhaustive all-by-all pairwise comparison of sequences cannot scale for a high volume of data, heuristic approaches have been recruited, such as the use of simple similarity thresholds. In this article, we present a comparison between CD-HIT and UCLUST, the two best-known clustering tools for sequence database deduplication. 
Our contributions include a detailed assessment of the redundancy remaining after deduplication, application of standard clustering evaluation metrics to quantify the cohesion and separation of the clusters generated by each method, and a biological case study that assesses intracluster function annotation consistency to demonstrate the impact of these factors on a practical application of the sequence clustering methods. Our results show that the trade-off between efficiency and accuracy becomes acute when low threshold values are used and when cluster sizes are large. This evaluation leads to practical recommendations for users for more effective uses of the sequence clustering tools for deduplication.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Gal:2018:CPD, author = "Avigdor Gal and Arik Senderovich and Matthias Weidlich", title = "Challenge Paper: Data Quality Issues in Queue Mining", journal = j-JDIQ, volume = "9", number = "4", pages = "18:1--18:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3165712", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Musyaffa:2018:EOF, author = "Fathoni A. 
Musyaffa and Christiane Engels and Maria-Esther Vidal and Fabrizio Orlandi and S{\"o}ren Auer", title = "Experience: Open Fiscal Datasets, Common Issues, and Recommendations", journal = j-JDIQ, volume = "9", number = "4", pages = "19:1--19:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190576", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Public administrations are continuously publishing open data, increasing the amount of government open data over time. The published data includes budgets and spending as part of fiscal data; publishing these data is an important part of transparent and accountable governance. However, open fiscal data should also meet open data publication guidelines. When requirements in data guidelines are not met, effective data analysis over published datasets cannot be performed effectively. In this article, we present Open Fiscal Data Publication (OFDP), a framework to assess the quality of open fiscal datasets. We also present an extensive open fiscal data assessment and common data quality issues found; additionally, open fiscal data publishing guidelines are presented. We studied and surveyed main quality factors for open fiscal datasets. Moreover, the collected quality factors have been scored according to the results of a questionnaire to score quality factors within the OFDP assessment framework. We gather and comprehensively analyze a representative set of 77 fiscal datasets from several public administrations across different regions at different levels (e.g., supranational, national, municipality). We characterize quality issues commonly arising in these datasets. Our assessment shows that there are many quality factors in fiscal data publication that still need to be taken care of so that the data can be analyzed effectively. 
Our proposed guidelines allow for publishing open fiscal data where these quality issues are avoided.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Alshayeb:2018:SSP, author = "Mohammad Alshayeb and Yasser Shaaban and Jarallah Al-Ghamdi", title = "{SPMDL}: Software Product Metrics Definition Language", journal = j-JDIQ, volume = "9", number = "4", pages = "20:1--20:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3185049", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Software metrics are becoming more acceptable measures for software quality assessment. However, there is no standard form to represent metric definitions, which would be useful for metrics exchange and customization. In this article, we propose the Software Product Metrics Definition Language (SPMDL). We develop an XML-based description language to define software metrics in a precise and reusable form. Metric definitions in SPMDL are based on meta-models extracted from either source code or design artifacts, such as the Dagstuhl Middle Meta-model, with support for various abstraction levels. The language defines several flexible computation mechanisms, such as extended Object Constraint Language queries and predefined graph operations on the meta-model. SPMDL provides an unambiguous description of the metric definition; it is also easy to use and is extensible.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ashish:2018:MRB, author = "Naveen Ashish and Arihant Patawari", title = "Machine Reading of Biomedical Data Dictionaries", journal = j-JDIQ, volume = "9", number = "4", pages = "21:1--21:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177874", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This article describes an approach for the automated reading of biomedical data dictionaries. Automated reading is the process of extracting element details for each of the data elements from a data dictionary in a document format (such as PDF) to a completely structured representation. A structured representation is essential if the data dictionary metadata are to be used in applications such as data integration and also in evaluating the quality of the associated data. We present an approach and implemented solution for the problem, considering different formats of data dictionaries. We have a particular focus on the most challenging format with a machine-learning classification solution to the problem using conditional random field classifiers. We present an evaluation using several actual data dictionaries, demonstrating the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Chiang:2018:IPS, author = "Fei Chiang and Dhruv Gairola", title = "{InfoClean}: Protecting Sensitive Information in Data Cleaning", journal = j-JDIQ, volume = "9", number = "4", pages = "22:1--22:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190577", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Data quality has become a pervasive challenge for organizations as they wrangle with large, heterogeneous datasets to extract value. Given the proliferation of sensitive and confidential information, it is crucial to consider data privacy concerns during the data cleaning process. For example, in medical database applications, varying levels of privacy are enforced across the attribute values. Attributes such as a patient's country or city of residence may be less sensitive than the patient's prescribed medication. Traditional data cleaning techniques assume the data is openly accessible, without considering the differing levels of information sensitivity. In this work, we take the first steps toward a data cleaning model that integrates privacy as part of the data cleaning process. We present a privacy-aware data cleaning framework that differentiates the information content among the attribute values during the data cleaning process to resolve data inconsistencies while minimizing the amount of information disclosed. Our data repair algorithm includes a set of data disclosure operations that considers the information content of the underlying attribute values, while maximizing data utility. 
Our evaluation using real datasets shows that our algorithm scales well, and achieves improved performance and comparable repair accuracy against existing data cleaning solutions.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bertino:2018:ACE, author = "Elisa Bertino and Mohammad R. Jahanshahi", title = "Adaptive and Cost-Effective Collection of High-Quality Data for Critical Infrastructure and Emergency Management in Smart Cities --- Framework and Challenges", journal = j-JDIQ, volume = "10", number = "1", pages = "1:1--1:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190579", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Flores:2018:IQA, author = "Javier Flores and Jun Sun", title = "Information Quality Awareness and Information Quality Practice", journal = j-JDIQ, volume = "10", number = "1", pages = "2:1--2:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3182182", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Healthcare organizations increasingly rely on electronic information to optimize their operations. Information of high diversity from various sources accentuate the relevance and importance of information quality (IQ). The quality of information needs to be improved to support a more efficient and reliable utilization of healthcare information systems (IS).
This can only be achieved through the implementation of initiatives followed by most users across an organization. The purpose of this study is to examine how awareness of IS users about IQ issues would affect their IQ behavior. Based on multiple theoretical frameworks, it is hypothesized that different aspects of user motivation mediate the relationship between the awareness on both beneficial and problematic situations and IQ practice inclination. In addition, social influence and facilitating condition moderate the relationship between IQ practice inclination and overt IQ practice. The theoretical and practical implications of findings are discussed, especially how to enhance IQ compliance in the healthcare settings.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bors:2018:VIC, author = "Christian Bors and Theresia Gschwandtner and Simone Kriglstein and Silvia Miksch and Margit Pohl", title = "Visual Interactive Creation, Customization, and Analysis of Data Quality Metrics", journal = j-JDIQ, volume = "10", number = "1", pages = "3:1--3:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3190578", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "During data preprocessing, analysts spend a significant part of their time and effort profiling the quality of the data along with cleansing and transforming the data for further analysis. While quality metrics --- ranging from general to domain-specific measures --- support assessment of the quality of a dataset, there are hardly any approaches to visually support the analyst in customizing and applying such metrics. Yet, visual approaches could facilitate users' involvement in data quality assessment.
We present MetricDoc, an interactive environment for assessing data quality that provides customizable, reusable quality metrics in combination with immediate visual feedback. Moreover, we provide an overview visualization of these quality metrics along with error visualizations that facilitate interactive navigation of the data to determine the causes of quality issues present in the data. In this article, we describe the architecture, design, and evaluation of MetricDoc, which underwent several design cycles, including heuristic evaluation and expert reviews as well as a focus group with data quality, human-computer interaction, and visual analytics experts.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Zhang:2018:ASB, author = "Han Zhang and Shawndra Hill and David Rothschild", title = "Addressing Selection Bias in Event Studies with General-Purpose Social Media Panels", journal = j-JDIQ, volume = "10", number = "1", pages = "4:1--4:??", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3185048", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Data from Twitter have been employed in prior research to study the impacts of events. Conventionally, researchers use keyword-based samples of tweets to create a panel of Twitter users who mention event-related keywords during and after an event. However, the keyword-based sampling is limited in its objectivity dimension of data and information quality. First, the technique suffers from selection bias since users who discuss an event are already more likely to discuss event-related topics beforehand. Second, there are no viable control groups for comparison to a keyword-based sample of Twitter users. 
We propose an alternative sampling approach to construct panels of users defined by their geolocation. Geolocated panels are exogenous to the keywords in users' tweets, resulting in less selection bias than the keyword panel method. Geolocated panels allow us to follow within-person changes over time and enable the creation of comparison groups. We compare different panels in two real-world settings: response to mass shootings and TV advertising. We first show the strength of the selection biases of keyword panels. Then, we empirically illustrate how geolocated panels reduce selection biases and allow meaningful comparison groups regarding the impact of the studied events. We are the first to provide a clear, empirical example of how a better panel selection design, based on an exogenous variable such as geography, both reduces selection bias compared to the current state of the art and increases the value of Twitter research for studying events. While we advocate for the use of a geolocated panel, we also discuss its weaknesses and application scenario seriously. This article also calls attention to the importance of selection bias in impacting the objectivity of social media data.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Puentes:2018:CQE, author = "John Puentes and Pedro Merino Laso and David Brosset", title = "The Challenge of Quality Evaluation in Fraud Detection", journal = j-JDIQ, volume = "10", number = "2", pages = "5:1--5:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3228341", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3228341", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bertino:2018:CAC, author = "Elisa Bertino and Amani Abu Jabal and Seraphin Calo and Dinesh Verma and Christopher Williams", title = "The Challenge of Access Control Policies Quality", journal = j-JDIQ, volume = "10", number = "2", pages = "6:1--6:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209668", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3209668", abstract = "Access Control policies allow one to control data sharing among multiple subjects. For high assurance data security, it is critical that such policies be fit for their purpose. In this paper we introduce the notion of ``policy quality'' and elaborate on its many dimensions, such as consistency, completeness, and minimality. We introduce a framework supporting the analysis of policies with respect to the introduced quality dimensions and elaborate on research challenges, including policy analysis for large-scale distributed systems, assessment of policy correctness, and analysis of policies expressed in richer policy models.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Karanja:2018:CPT, author = "Evanson Mwangi Karanja and Shedden Masupe and Mandu Gasennelwe-Jeffrey", title = "Challenge Paper: Towards Open Datasets for {Internet of Things} Malware", journal = j-JDIQ, volume = "10", number = "2", pages = "7:1--7:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3230669", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Koumarelas:2018:EEA, author = "Ioannis Koumarelas and Axel Kroschk and Clifford Mosley and Felix Naumann", title = "Experience: Enhancing Address Matching with Geocoding and Similarity Measure Selection", journal = j-JDIQ, volume = "10", number = "2", pages = "8:1--8:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3232852", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Given a query record, record matching is the problem of finding database records that represent the same real-world object. In the easiest scenario, a database record is completely identical to the query. However, in most cases, problems do arise, for instance, as a result of data errors or data integrated from multiple sources or received from restrictive form fields. These problems are usually difficult, because they require a variety of actions, including field segmentation, decoding of values, and similarity comparisons, each requiring some domain knowledge. 
In this article, we study the problem of matching records that contain address information, including attributes such as Street-address and City. To facilitate this matching process, we propose a domain-specific procedure to, first, enrich each record with a more complete representation of the address information through geocoding and reverse-geocoding and, second, to select the best similarity measure per each address attribute that will finally help the classifier to achieve the best f-measure. We report on our experience in selecting geocoding services and discovering similarity measures for a concrete but common industry use-case.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ferro:2018:ISIa, author = "Nicola Ferro and Norbert Fuhr and Andreas Rauber", title = "Introduction to the Special Issue on Reproducibility in Information Retrieval: Evaluation Campaigns, Collections, and Analyses", journal = j-JDIQ, volume = "10", number = "3", pages = "9:1--9:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3268408", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Moffat:2018:EMU, author = "Alistair Moffat and Falk Scholer and Ziying Yang", title = "Estimating Measurement Uncertainty for Information Retrieval Effectiveness Metrics", journal = j-JDIQ, volume = "10", number = "3", pages = "10:1--10:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239572", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3239572", abstract = "One typical way of building test collections for offline measurement of information retrieval systems is to pool the ranked outputs of different systems down to some chosen depth $d$ and then form relevance judgments for those documents only. Non-pooled documents --- ones that did not appear in the top-$d$ sets of any of the contributing systems --- are then deemed to be non-relevant for the purposes of evaluating the relative behavior of the systems. In this article, we use RBP-derived residuals to re-examine the reliability of that process. By fitting the RBP parameter $ \phi $ to maximize similarity between AP- and NDCG-induced system rankings, on the one hand, and RBP-induced rankings, on the other, an estimate can be made as to the potential score uncertainty associated with those two recall-based metrics. We then consider the effect that residual size --- as an indicator of possible measurement uncertainty in utility-based metrics --- has in connection with recall-based metrics by computing the effect of increasing pool sizes and examining the trends that arise in terms of both metric score and system separability using standard statistical tests.
The experimental results show that the confidence levels expressed via the $p$-values generated by statistical tests are only weakly connected to the size of the residual and to the degree of measurement uncertainty caused by the presence of unjudged documents. Statistical confidence estimates are, however, largely consistent as pooling depths are altered. We therefore recommend that all such experimental results should report, in addition to the outcomes of statistical significance tests, the residual measurements generated by a suitably matched weighted-precision metric, to give a clear indication of measurement uncertainty that arises due to the presence of unjudged documents in test collections with finite pooled judgments.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Roitero:2018:RGE, author = "Kevin Roitero and Marco Passon and Giuseppe Serra and Stefano Mizzaro", title = "{Reproduce}. {Generalize}. {Extend}. {On} Information Retrieval Evaluation without Relevance Judgments", journal = j-JDIQ, volume = "10", number = "3", pages = "11:1--11:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3241064", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3241064", abstract = "The evaluation of retrieval effectiveness by means of test collections is a commonly used methodology in the information retrieval field. Some researchers have addressed the quite fascinating research question of whether it is possible to evaluate effectiveness completely automatically, without human relevance assessments.
Since human relevance assessment is one of the main costs of building a test collection, both in human time and money resources, this rather ambitious goal would have a practical impact. In this article, we reproduce the main results on evaluating information retrieval systems without relevance judgments; furthermore, we generalize such previous work to analyze the effect of test collections, evaluation metrics, and pool depth. We also expand the idea to semi-automatic evaluation and estimation of topic difficulty. Our results show that (i) previous work is overall reproducible, although some specific results are not; (ii) collection, metric, and pool depth impact the automatic evaluation of systems, which is anyway accurate in several cases; (iii) semi-automatic evaluation is an effective methodology; and (iv) automatic evaluation can (to some extent) be used to predict topic difficulty.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Roitero:2018:RIE, author = "Kevin Roitero and Michael Soprano and Andrea Brunello and Stefano Mizzaro", title = "Reproduce and Improve: an Evolutionary Approach to Select a Few Good Topics for Information Retrieval Evaluation", journal = j-JDIQ, volume = "10", number = "3", pages = "12:1--12:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239573", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3239573", abstract = "Effectiveness evaluation of information retrieval systems by means of a test collection is a widely used methodology. However, it is rather expensive in terms of resources, time, and money; therefore, many researchers have proposed methods for a cheaper evaluation. 
One particular approach, on which we focus in this article, is to use fewer topics: in TREC-like initiatives, usually system effectiveness is evaluated as the average effectiveness on a set of $n$ topics (usually, $ n = 50 $, but more than 1,000 have been also adopted); instead of using the full set, it has been proposed to find the best subsets of a few good topics that evaluate the systems in the most similar way to the full set. The computational complexity of the task has so far limited the analysis that has been performed. We develop a novel and efficient approach based on a multi-objective evolutionary algorithm. The higher efficiency of our new implementation allows us to reproduce some notable results on topic set reduction, as well as perform new experiments to generalize and improve such results. We show that our approach is able to both reproduce the main state-of-the-art results and to allow us to analyze the effect of the collection, metric, and pool depth used for the evaluation. Finally, differently from previous studies, which have been mainly theoretical, we are also able to discuss some practical topic selection strategies, integrating results of automatic evaluation approaches.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Jagerman:2018:OLL, author = "Rolf Jagerman and Krisztian Balog and Maarten {De Rijke}", title = "{OpenSearch}: Lessons Learned from an Online Evaluation Campaign", journal = j-JDIQ, volume = "10", number = "3", pages = "13:1--13:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239575", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:16:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3239575", abstract = "We report on our experience with TREC OpenSearch, an online evaluation campaign that enabled researchers to evaluate their experimental retrieval methods using real users of a live website. Specifically, we focus on the task of ad hoc document retrieval within the academic search domain, and work with two search engines, CiteSeerX and SSOAR, that provide us with traffic. We describe our experimental platform, which is based on the living labs methodology, and report on the experimental results obtained. We also share our experiences, challenges, and the lessons learned from running this track in 2016 and 2017.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ferro:2018:ISIb, author = "Nicola Ferro and Norbert Fuhr and Andreas Rauber", title = "Introduction to the Special Issue on Reproducibility in Information Retrieval: Tools and Infrastructures", journal = j-JDIQ, volume = "10", number = "4", pages = "14:1--14:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3268410", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Hopfgartner:2018:ESC, author = "Frank Hopfgartner and Allan Hanbury and Henning M{\"u}ller and Ivan Eggel and Krisztian Balog and Torben Brodt and Gordon V. Cormack and Jimmy Lin and Jayashree Kalpathy-Cramer and Noriko Kando and Makoto P. Kato and Anastasia Krithara and Tim Gollub and Martin Potthast and Evelyne Viegas and Simon Mercer", title = "Evaluation-as-a-Service for the Computational Sciences: Overview and Outlook", journal = j-JDIQ, volume = "10", number = "4", pages = "15:1--15:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239570", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Evaluation in empirical computer science is essential to show progress and assess technologies developed. Several research domains such as information retrieval have long relied on systematic evaluation to measure progress: here, the Cranfield paradigm of creating shared test collections, defining search tasks, and collecting ground truth for these tasks has persisted up until now. 
In recent years, however, several new challenges have emerged that do not fit this paradigm very well: extremely large data sets, confidential data sets as found in the medical domain, and rapidly changing data sets as often encountered in industry. Crowdsourcing has also changed the way in which industry approaches problem-solving with companies now organizing challenges and handing out monetary awards to incentivize people to work on their challenges, particularly in the field of machine learning. This article is based on discussions at a workshop on Evaluation-as-a-Service (EaaS). EaaS is the paradigm of not providing data sets to participants and have them work on the data locally, but keeping the data central and allowing access via Application Programming Interfaces (API), Virtual Machines (VM), or other possibilities to ship executables. The objectives of this article are to summarize and compare the current approaches and consolidate the experiences of these approaches to outline the next steps of EaaS, particularly toward sustainable research infrastructures. The article summarizes several existing approaches to EaaS and analyzes their usage scenarios and also the advantages and disadvantages. The many factors influencing EaaS are summarized, and the environment in terms of motivations for the various stakeholders, from funding agencies to challenge organizers, researchers and participants, to industry interested in supplying real-world problems for which they require solutions. EaaS solves many problems of the current research environment, where data sets are often not accessible to many researchers. Executables of published tools are equally often not available making the reproducibility of results impossible. EaaS, however, creates reusable/citable data sets as well as available executables. 
Many challenges remain, but such a framework for research can also foster more collaboration between researchers, potentially increasing the speed of obtaining research results.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Yang:2018:ARR, author = "Peilin Yang and Hui Fang and Jimmy Lin", title = "{Anserini}: Reproducible Ranking Baselines Using {Lucene}", journal = j-JDIQ, volume = "10", number = "4", pages = "16:1--16:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239571", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This work tackles the perennial problem of reproducible baselines in information retrieval research, focusing on bag-of-words ranking models. Although academic information retrieval researchers have a long history of building and sharing systems, they are primarily designed to facilitate the publication of research papers. As such, these systems are often incomplete, inflexible, poorly documented, difficult to use, and slow, particularly in the context of modern web-scale collections. Furthermore, the growing complexity of modern software ecosystems and the resource constraints most academic research groups operate under make maintaining open-source systems a constant struggle. However, except for a small number of companies (mostly commercial web search engines) that deploy custom infrastructure, Lucene has become the de facto platform in industry for building search applications. Lucene has an active developer base, a large audience of users, and diverse capabilities to work with heterogeneous collections at scale. However, it lacks systematic support for ad hoc experimentation using standard test collections. 
We describe Anserini, an information retrieval toolkit built on Lucene that fills this gap. Our goal is to simplify ad hoc experimentation and allow researchers to easily reproduce results with modern bag-of-words ranking models on diverse test collections. With Anserini, we demonstrate that Lucene provides a suitable framework for supporting information retrieval research. Experiments show that our system efficiently indexes large web collections, provides modern ranking models that are on par with research implementations in terms of effectiveness, and supports low-latency query evaluation to facilitate rapid experimentation.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Kiesel:2018:RWC, author = "Johannes Kiesel and Florian Kneist and Milad Alshomary and Benno Stein and Matthias Hagen and Martin Potthast", title = "Reproducible {Web} Corpora: Interactive Archiving with Automatic Quality Assessment", journal = j-JDIQ, volume = "10", number = "4", pages = "17:1--17:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3239574", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The evolution of web pages from static HTML pages toward dynamic pieces of software has rendered archiving them increasingly difficult. Nevertheless, an accurate, reproducible web archive is a necessity to ensure the reproducibility of web-based research. Archiving web pages reproducibly, however, is currently not part of best practices for web corpus construction. As a result, and despite the ongoing efforts of other stakeholders to archive the web, tools for the construction of reproducible web corpora are insufficient or ill-fitted. This article presents a new tool tailored to this purpose.
It relies on emulating user interactions with a web page while recording all network traffic. The customizable user interactions can be replayed on demand, while requests sent by the archived page are served with the recorded responses. The tool facilitates reproducible user studies, user simulations, and evaluations of algorithms that rely on extracting data from web pages. To evaluate our tool, we conduct the first systematic assessment of reproduction quality for rendered web pages. Using our tool, we create a corpus of 10,000 web pages carefully sampled from the Common Crawl and manually annotated with regard to reproduction quality via crowdsourcing. Based on this data, we test three approaches to automatic reproduction-quality assessment. An off-the-shelf neural network, trained on visual differences between the web page during archiving and reproduction, matches the manual assessments best. This automatic assessment of reproduction quality allows for immediate bugfixing during archiving and continuous development of our tool as the web continues to evolve.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Roy:2018:CCD, author = "Dwaipayan Roy and Mandar Mitra and Debasis Ganguly", title = "To Clean or Not to Clean: Document Preprocessing and Reproducibility", journal = j-JDIQ, volume = "10", number = "4", pages = "18:1--18:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3242180", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Web document collections such as WT10G, GOV2, and ClueWeb are widely used for text retrieval experiments. Documents in these collections contain a fair amount of non-content-related markup in the form of tags, hyperlinks, and so on. 
Published articles that use these corpora generally do not provide specific details about how this markup information is handled during indexing. However, this question turns out to be important: Through experiments, we find that including or excluding metadata in the index can produce significantly different results with standard IR models. More importantly, the effect varies across models and collections. For example, metadata filtering is found to be generally beneficial when using BM25, or language modeling with Dirichlet smoothing, but can significantly reduce retrieval effectiveness if language modeling is used with Jelinek-Mercer smoothing. We also observe that, in general, the performance differences become more noticeable as the amount of metadata in the test collections increase. Given this variability, we believe that the details of document preprocessing are significant from the point of view of reproducibility. In a second set of experiments, we also study the effect of preprocessing on query expansion using RM3. In this case, once again, we find that it is generally better to remove markup before using documents for query expansion.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Srivastava:2019:EHQ, author = "Divesh Srivastava and Monica Scannapieco and Thomas C. Redman", title = "Ensuring High-Quality Private Data for Responsible Data Science: Vision and Challenges", journal = j-JDIQ, volume = "11", number = "1", pages = "1:1--1:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3287168", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3287168", abstract = "High-quality data is critical for effective data science. 
As the use of data science has grown, so too have concerns that individuals' rights to privacy will be violated. This has led to the development of data protection regulations around the globe and the use of sophisticated anonymization techniques to protect privacy. Such measures make it more challenging for the data scientist to understand the data, exacerbating issues of data quality. Responsible data science aims to develop useful insights from the data while fully embracing these considerations. We pose the high-level problem in this article, ``How can a data scientist develop the needed trust that private data has high quality?'' We then identify a series of challenges for various data-centric communities and outline research questions for data quality and privacy researchers, which would need to be addressed to effectively answer the problem posed in this article.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Rios:2019:CTF, author = "Julio C{\'e}sar Cort{\'e}s R{\'\i}os and Norman W. Paton and Alvaro A. A. Fernandes and Edward Abel and John A. Keane", title = "Crowdsourced Targeted Feedback Collection for Multicriteria Data Source Selection", journal = j-JDIQ, volume = "11", number = "1", pages = "2:1--2:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3284934", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3284934", abstract = "A multicriteria data source selection (MCSS) scenario identifies, from a set of candidate data sources, the subset that best meets users' needs. These needs are expressed using several criteria, which are used to evaluate the candidate data sources. 
An MCSS problem can be solved using multidimensional optimization techniques that trade off the different objectives. Sometimes one may have uncertain knowledge regarding how well the candidate data sources meet the criteria. In order to overcome this uncertainty, one may rely on end-users or crowds to annotate the data items produced by the sources in relation to the selection criteria. In this article, a proposed Targeted Feedback Collection (TFC) approach is introduced that aims to identify those data items on which feedback should be collected, thereby providing evidence on how the sources satisfy the required criteria. The proposed TFC targets feedback by considering the confidence intervals around the estimated criteria values, with a view to increasing the confidence in the estimates that are most relevant to the multidimensional optimization. Variants of the proposed TFC approach have been developed for use where feedback is expected to be reliable (e.g., where it is provided by trusted experts) and where feedback is expected to be unreliable (e.g., from crowd workers). Both variants have been evaluated, and positive results are reported against other approaches to feedback collection, including active learning, in experiments that involve real-world datasets and crowdsourcing.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Dallachiesa:2019:ICQ, author = "Michele Dallachiesa and Charu C. 
Aggarwal and Themis Palpanas", title = "Improving Classification Quality in Uncertain Graphs", journal = j-JDIQ, volume = "11", number = "1", pages = "3:1--3:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3242095", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3242095", abstract = "In many real applications that use and analyze networked data, the links in the network graph may be erroneous or derived from probabilistic techniques. In such cases, the node classification problem can be challenging, since the unreliability of the links may affect the final results of the classification process. If the information about link reliability is not used explicitly, then the classification accuracy in the underlying network may be affected adversely. In this article, we focus on situations that require the analysis of the uncertainty that is present in the graph structure. We study the novel problem of node classification in uncertain graphs, by treating uncertainty as a first-class citizen. We propose two techniques based on a Bayes model and automatic parameter selection and show that the incorporation of uncertainty in the classification process as a first-class citizen is beneficial. We experimentally evaluate the proposed approach using different real data sets and study the behavior of the algorithms under different conditions. The results demonstrate the effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Casey:2019:FRR, author = "K. 
Michael Casey and Kevin {Casey Jr.}", title = "Financial Regulatory and Risk Management Challenges Stemming from Firm-Specific Digital Misinformation", journal = j-JDIQ, volume = "11", number = "1", pages = "4:1--4:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3274655", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:00 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3274655", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Fan:2019:DGC, author = "Wenfei Fan", title = "Dependencies for Graphs: Challenges and Opportunities", journal = j-JDIQ, volume = "11", number = "2", pages = "5:1--5:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310230", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3310230", abstract = "What are graph dependencies? What do we need them for? What new challenges do they introduce? This article tackles these questions. It aims to incite curiosity and interest in this emerging area of research.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Sillaber:2019:EDI, author = "Christian Sillaber and Andrea Mussmann and Ruth Breu", title = "Experience: Data and Information Quality Challenges in Governance, Risk, and Compliance Management", journal = j-JDIQ, volume = "11", number = "2", pages = "6:1--6:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3297721", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3297721", abstract = "Governance, risk, and compliance (GRC) managers often struggle to document the current state of their organizations. This is due to the complexity of their IS landscape, the complex regulatory and organizational environment, and the frequent changes to both. GRC tools seek to support them by integrating existing information sources. However, a comprehensive analysis of how the data is managed in such tools, as well as the impact of data quality, is still missing. To build a basis of empirical data, we conducted a series of interviews with information security managers responsible for GRC management activities in their organizations. The results of a qualitative content analysis of these interviews suggest that decision makers largely depend on high-quality documentation but struggle to maintain their documentation at the required level for long periods of time. This work discusses factors affecting the quality of GRC data and information and provides insights into approaches implemented by organizations to analyze, improve, and maintain the quality of their GRC data and information.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Lazar:2019:EEM, author = "Alina Lazar and Ling Jin and C. Anna Spurlock and Kesheng Wu and Alex Sim and Annika Todd", title = "Evaluating the Effects of Missing Values and Mixed Data Types on Social Sequence Clustering Using {t-SNE} Visualization", journal = j-JDIQ, volume = "11", number = "2", pages = "7:1--7:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3301294", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301294", abstract = "The goal of this work is to investigate the impact of missing values in clustering joint categorical social sequences. Identifying patterns in sociodemographic longitudinal data is important in a number of social science settings. However, performing analytical operations, such as clustering on life course trajectories, is challenging due to the categorical and multidimensional nature of the data, their mixed data types, and corruption by missing and inconsistent values. Data quality issues were investigated previously on single variable sequences. To understand their effects on multivariate sequence analysis, we employ a dataset of mixed data types and missing values, a dissimilarity measure designed for joint categorical sequence data, together with dimensionality reduction methodologies in a systematic design of sequence clustering experiments. Given the categorical nature of our data, we employ an ``edit'' distance using optimal matching. Because each data record has multiple variables of different types, we investigate the impact of mixing these variables in a single dissimilarity measure. 
Between variables with binary values and those with multiple nominal values, we find that the ability to overcome missing data problems is more difficult in the nominal domain than in the binary domain. Additionally, alignment of leading missing values can result in systematic biases in dissimilarity matrices and subsequently introduce both artificial clusters and unrealistic interpretations of associated data domains. We demonstrate the usage of t-distributed stochastic neighborhood embedding to visually guide mitigation of such biases by tuning the missing value substitution cost parameter or determining an optimal sequence span.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Muller:2019:ADQ, author = "Daniel M{\"u}ller and Pratiksha Jain and Yieh-Funk Te", title = "Augmenting Data Quality through High-Precision Gender Categorization", journal = j-JDIQ, volume = "11", number = "2", pages = "8:1--8:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3297720", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3297720", abstract = "Mappings of first name to gender have been widely recognized as a critical tool for the completion, study, and validation of data records in a range of areas. In this study, we investigate how organizations with large databases of existing entities can create their own mappings between first names and gender and how these mappings can be improved and utilized. Therefore, we first explore a dataset with demographic information on more than 4 million people, which was provided by a car insurance company. Then, we study how naming conventions have changed over time and how they differ by nationality. 
Next, we build a probabilistic first-name-to-gender mapping and augment the mapping by adding nationality and decade of birth to improve the mapping's performance. We test our mapping in two-label and three-label settings and further validate our mapping by categorizing patent filings by gender of the inventor. We compare the results with previous studies' outcomes and find that our mapping produces high-precision results. We validate that the additional information of nationality and year of birth improve the precision scores of name-to-gender mappings. Therefore, the proposed approach constitutes an efficient process for improving the data quality of organizations' records, if the gender attribute is missing or unreliable.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Hassan:2019:ISI, author = "Naeemul Hassan and Chengkai Li and Jun Yang and Cong Yu", title = "Introduction to the Special Issue on Combating Digital Misinformation and Disinformation", journal = j-JDIQ, volume = "11", number = "3", pages = "9:1--9:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321484", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3321484", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Zannettou:2019:WFI, author = "Savvas Zannettou and Michael Sirivianos and Jeremy Blackburn and Nicolas Kourtellis", title = "The {Web} of False Information: Rumors, Fake News, Hoaxes, Clickbait, and Various Other Shenanigans", journal = j-JDIQ, volume = "11", number = "3", pages = "10:1--10:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309699", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309699", abstract = "A new era of Information Warfare has arrived. Various actors, including state-sponsored ones, are weaponizing information on Online Social Networks to run false-information campaigns with targeted manipulation of public opinion on specific topics. These false-information campaigns can have dire consequences to the public: mutating their opinions and actions, especially with respect to critical world events like major elections. Evidently, the problem of false information on the Web is a crucial one and needs increased public awareness as well as immediate attention from law enforcement agencies, public institutions, and in particular, the research community. In this article, we make a step in this direction by providing a typology of the Web's false-information ecosystem, composed of various types of false-information, actors, and their motives. We report a comprehensive overview of existing research on the false-information ecosystem by identifying several lines of work: (1) how the public perceives false information; (2) understanding the propagation of false information; (3) detecting and containing false information on the Web; and (4) false information on the political stage. 
In this work, we pay particular attention to political false information as: (1) it can have dire consequences to the community (e.g., when election results are mutated) and (2) previous work shows that this type of false information propagates faster and further when compared to other types of false information. Finally, for each of these lines of work, we report several future research directions that can help us better understand and mitigate the emerging problem of false-information dissemination on the Web.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Xue:2019:CAT, author = "Hao Xue and Qiaozhi Wang and Bo Luo and Hyunjin Seo and Fengjun Li", title = "Content-Aware Trust Propagation Toward Online Review Spam Detection", journal = j-JDIQ, volume = "11", number = "3", pages = "11:1--11:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3305258", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3305258", abstract = "With the increasing popularity of online review systems, a large volume of user-generated content becomes available to help people make reasonable judgments about the quality of services and products from unknown providers. However, these platforms are frequently abused since fraudulent information can be freely inserted by potentially malicious users without validation. Consequently, online review systems become targets of individual and professional spammers, who insert deceptive reviews by manipulating the rating and/or the content of the reviews. 
In this work, we propose a review spamming detection scheme based on the deviation between the aspect-specific opinions extracted from individual reviews and the aggregated opinions on the corresponding aspects. In particular, we model the influence on the trustworthiness of the user due to his opinion deviations from the majority in the form of a deviation-based penalty, and integrate this penalty into a three-layer trust propagation framework to iteratively compute the trust scores for users, reviews, and review targets, respectively. The trust scores are effective indicators of spammers, since they reflect the overall deviation of a user from the aggregated aspect-specific opinions across all targets and all aspects. Experiments on the dataset collected from Yelp.com show that the proposed detection scheme based on aspect-specific content-aware trust propagation is able to measure users' trustworthiness based on opinions expressed in reviews.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Atanasova:2019:AFC, author = "Pepa Atanasova and Preslav Nakov and Llu{\'\i}s M{\`a}rquez and Alberto Barr{\'o}n-Cede{\~n}o and Georgi Karadzhov and Tsvetomila Mihaylova and Mitra Mohtarami and James Glass", title = "Automatic Fact-Checking Using Context and Discourse Information", journal = j-JDIQ, volume = "11", number = "3", pages = "12:1--12:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3297722", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3297722", abstract = "We study the problem of automatic fact-checking, paying special attention to the impact of contextual and discourse information. 
We address two related tasks: (i) detecting check-worthy claims and (ii) fact-checking claims. We develop supervised systems based on neural networks, kernel-based support vector machines, and combinations thereof, which make use of rich input representations in terms of discourse cues and contextual features. For the check-worthiness estimation task, we focus on political debates, and we model the target claim in the context of the full intervention of a participant and the previous and following turns in the debate, taking into account contextual meta information. For the fact-checking task, we focus on answer verification in a community forum, and we model the veracity of the answer with respect to the entire question-answer thread in which it occurs as well as with respect to other related posts from the entire forum. We develop annotated datasets for both tasks and we run extensive experimental evaluation, confirming that both types of information---but especially contextual features---play an important role.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Lin:2019:DPF, author = "Peng Lin and Qi Song and Yinghui Wu and Jiaxing Pi", title = "Discovering Patterns for Fact Checking in Knowledge Graphs", journal = j-JDIQ, volume = "11", number = "3", pages = "13:1--13:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3286488", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3286488", abstract = "This article presents a new framework that incorporates graph patterns to support fact checking in knowledge graphs. Our method discovers discriminant graph patterns to construct classifiers for fact prediction.
First, we propose a class of graph fact checking rules (GFCs). A GFC incorporates graph patterns that best distinguish true and false facts of generalized fact statements. We provide statistical measures to characterize useful patterns that are both discriminant and diversified. Second, we show that it is feasible to discover GFCs in large graphs with optimality guarantees. We develop an algorithm that performs localized search to generate a stream of graph patterns, and dynamically assemble the best GFCs from multiple GFC sets, where each set ensures quality scores within certain ranges. The algorithm guarantees a $ (1 / 2 - \epsilon) $ approximation when it (early) terminates. We also develop a space-efficient alternative that dynamically spawns prioritized patterns with best marginal gains to the verified GFCs. It guarantees a $ (1 - 1 / e) $ approximation. Both strategies guarantee a bounded time cost independent of the size of the underlying graph. Third, to support fact checking, we develop two classifiers, which make use of top-ranked GFCs as predictive rules or instance-level features of the pattern matches induced by GFCs, respectively. Using real-world data, we experimentally verify the efficiency and the effectiveness of GFC-based techniques for fact checking in knowledge graphs and verify its application in knowledge exploration and news prediction.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Borges:2019:CSF, author = "Lu{\'\i}s Borges and Bruno Martins and P{\'a}vel Calado", title = "Combining Similarity Features and Deep Representation Learning for Stance Detection in the Context of Checking Fake News", journal = j-JDIQ, volume = "11", number = "3", pages = "14:1--14:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3287763", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3287763", abstract = "Fake news is nowadays an issue of pressing concern, given its recent rise as a potential threat to high-quality journalism and well-informed public discourse. The Fake News Challenge (FNC-1) was organized in early 2017 to encourage the development of machine-learning-based classification systems for stance detection (i.e., for identifying whether a particular news article agrees, disagrees, discusses, or is unrelated to a particular news headline), thus helping in the detection and analysis of possible instances of fake news. This article presents a novel approach to tackle this stance detection problem, based on the combination of string similarity features with a deep neural network architecture that leverages ideas previously advanced in the context of learning-efficient text representations, document classification, and natural language inference. Specifically, we use bi-directional Recurrent Neural Networks (RNNs), together with max-pooling over the temporal/sequential dimension and neural attention, for representing (i) the headline, (ii) the first two sentences of the news article, and (iii) the entire news article. 
These representations are then combined/compared, complemented with similarity features inspired on other FNC-1 approaches, and passed to a final layer that predicts the stance of the article toward the headline. We also explore the use of external sources of information, specifically large datasets of sentence pairs originally proposed for training and evaluating natural language inference methods to pre-train specific components of the neural network architecture (e.g., the RNNs used for encoding sentences). The obtained results attest to the effectiveness of the proposed ideas and show that our model, particularly when considering pre-training and the combination of neural representations together with similarity features, slightly outperforms the previous state of the art.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Abiteboul:2019:TFD, author = "Serge Abiteboul and Julia Stoyanovich", title = "Transparency, Fairness, Data Protection, Neutrality: Data Management Challenges in the Face of New Regulation", journal = j-JDIQ, volume = "11", number = "3", pages = "15:1--15:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310231", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3310231", abstract = "The data revolution continues to transform every sector of science, industry, and government. Due to the incredible impact of data-driven technology on society, we are becoming increasingly aware of the imperative to use data and algorithms responsibly---in accordance with laws and ethical norms.
In this article, we discuss three recent regulatory frameworks: the European Union's General Data Protection Regulation (GDPR), the New York City Automated Decisions Systems (ADS) Law, and the Net Neutrality principle, which aim to protect the rights of individuals who are impacted by data collection and analysis. These frameworks are prominent examples of a global trend: Governments are starting to recognize the need to regulate data-driven algorithmic technology. Our goal in this article is to bring these regulatory frameworks to the attention of the data management community and to underscore the technical challenges they raise and that we, as a community, are well-equipped to address. The main takeaway of this article is that legal and ethical norms cannot be incorporated into data-driven systems as an afterthought. Rather, we must think in terms of responsibility by design, viewing it as a systems requirement.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bertino:2019:DTB, author = "Elisa Bertino and Ashish Kundu and Zehra Sura", title = "Data Transparency with Blockchain and {AI} Ethics", journal = j-JDIQ, volume = "11", number = "4", pages = "16:1--16:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3312750", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3312750", abstract = "Providing a 360${}^\circ $ view of a given data item especially for sensitive data is essential toward not only protecting the data and associated privacy but also assuring trust, compliance, and ethics of the systems that use or manage such data.
With the advent of General Data Protection Regulation, California Data Privacy Law, and other such regulatory requirements, it is essential to support data transparency in all such dimensions. Moreover, data transparency should not violate privacy and security requirements. In this article, we put forward a vision for how data transparency would be achieved in a de-centralized fashion using blockchain technology.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Fard:2019:ARA, author = "Amir Ebrahimi Fard and Scott Cunningham", title = "Assessing the Readiness of Academia in the Topic of False and Unverified Information", journal = j-JDIQ, volume = "11", number = "4", pages = "17:1--17:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3313788", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3313788", abstract = "The spread of false and unverified information has the potential to inflict damage by harming the reputation of individuals or organisations, shaking financial markets, and influencing crowd decisions in important events. This phenomenon needs to be properly curbed, otherwise it can contaminate other aspects of our social life. In this regard, academia as a key institution against false and unverified information is expected to play a pivotal role. Despite a great deal of research in this arena, the amount of progress by academia is not clear yet. This can lead to misjudgements about the performance of the topic of interest that can ultimately result in wrong science policies regarding academic efforts for quelling false and unverified information. 
In this research, we address this issue by assessing the readiness of academia in the topic of false and unverified information. To this end, we adopt the emergence framework and measure its dimensions (novelty, growth, coherence, and impact) over more than 21,000 articles published by academia about false and unverified information. Our results show the current body of research has had organic growth so far, which is not promising enough for confronting the problem of false and unverified information. To tackle this problem, we suggest an external push strategy that, compared to the early stages of the topic of interest, reinforces the emergence dimensions and leads to a higher level in every dimension.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Babcock:2019:DFF, author = "Matthew Babcock and David M. Beskow and Kathleen M. Carley", title = "Different Faces of False: The Spread and Curtailment of False Information in the {Black Panther Twitter} Discussion", journal = j-JDIQ, volume = "11", number = "4", pages = "18:1--18:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3339468", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3339468", abstract = "The task of combating false information online appears daunting, in part due to a public focus on how quickly it can spread and the clamor for automated platform-based interventions. While such concerns can be warranted, threat analysis and intervention design both benefit from a fuller understanding of different types of false information and of the community responses to them. 
Here, we present a study of the most tweeted about movie ever (Black Panther) in which the spread of false information of four different types is compared to the ad hoc Twitter community response. We find that (1) false information tweets played a small part in the overall conversation, (2) community-based debunking and shaming responses to false posts about attacks at theaters overwhelmed such posts by orders of magnitude, (3) as another form of community response, one type of false narrative (Satire) was used to attack another (Fake Attacks), and (4) the four types of false-information tweets differed in the use of hashtags and in the role played by originating users and responding users. Overall, this work helps to illustrate the importance of investigating ``on-the-ground'' community responses to fake news and other types of digital false information and to inform identification and intervention design and implementation.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Bosu:2019:EQB, author = "Michael F. Bosu and Stephen G. Macdonell", title = "Experience: Quality Benchmarking of Datasets Used in Software Effort Estimation", journal = j-JDIQ, volume = "11", number = "4", pages = "19:1--19:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328746", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3328746", abstract = "Data is a cornerstone of empirical software engineering (ESE) research and practice. Data underpin numerous process and project management activities, including the estimation of development effort and the prediction of the likely location and severity of defects in code.
Serious questions have been raised, however, over the quality of the data used in ESE. Data quality problems caused by noise, outliers, and incompleteness have been noted as being especially prevalent. Other quality issues, although also potentially important, have received less attention. In this study, we assess the quality of 13 datasets that have been used extensively in research on software effort estimation. The quality issues considered in this article draw on a taxonomy that we published previously based on a systematic mapping of data quality issues in ESE. Our contributions are as follows: (1) an evaluation of the ``fitness for purpose'' of these commonly used datasets and (2) an assessment of the utility of the taxonomy in terms of dataset benchmarking. We also propose a template that could be used to both improve the ESE data collection/submission process and to evaluate other such datasets, contributing to enhanced awareness of data quality issues in the ESE community and, in time, the availability and use of higher-quality datasets.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Ding:2019:CSA, author = "Junhua Ding and Xinchuan Li and Xiaojun Kang and Venkat N. Gudivada", title = "A Case Study of the Augmentation and Evaluation of Training Data for Deep Learning", journal = j-JDIQ, volume = "11", number = "4", pages = "20:1--20:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3317573", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3317573", abstract = "Deep learning has been widely used for extracting values from big data. As many other machine learning algorithms, deep learning requires significant training data. 
Experiments have shown both the volume and the quality of training data can significantly impact the effectiveness of the value extraction. In some cases, the volume of training data is not sufficiently large for effectively training a deep learning model. In other cases, the quality of training data is not high enough to achieve the optimal performance. Many approaches have been proposed for augmenting training data to mitigate the deficiency. However, whether the augmented data are ``fit for purpose'' of deep learning is still a question. A framework for comprehensively evaluating the effectiveness of the augmented data for deep learning is still not available. In this article, we first discuss a data augmentation approach for deep learning. The approach includes two components: the first one is to remove noisy data in a dataset using a machine learning based classification to improve its quality, and the second one is to increase the volume of the dataset for effectively training a deep learning model. To evaluate the quality of the augmented data in fidelity, variety, and veracity, a data quality evaluation framework is proposed. We demonstrated the effectiveness of the data augmentation approach and the data quality evaluation framework through studying an automated classification of biology cell images using deep learning. The experimental results clearly demonstrated the impact of the volume and quality of training data to the performance of deep learning and the importance of the data quality evaluation. The data augmentation approach and the data quality evaluation framework can be straightforwardly adapted for deep learning study in other domains.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Akhtar:2019:IAV, author = "Zahaib Akhtar and Anh Minh Le and Yun Seong Nam and Jessica Chen and Ramesh Govindan and Ethan Katz-Bassett and Sanjay Rao and Jibin Zhan", title = "Improving Adaptive Video Streaming through Session Classification", journal = j-JDIQ, volume = "11", number = "4", pages = "21:1--21:??", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309682", ISSN = "1936-1955", bibdate = "Tue Oct 22 07:17:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309682", abstract = "With internet video gaining increasing popularity and soaring to dominate network traffic, extensive studies are being carried out on how to achieve higher Quality of Experience (QoE) with the delivery of video content. Associated with the chunk-based streaming protocol, Adaptive Bitrate (ABR) algorithms have recently emerged to cope with the diverse and fluctuating network conditions by dynamically adjusting bitrates for future chunks. This inevitably involves predicting the future throughput of a video session. Some of the session features like Internet Service Provider (ISP), geographical location, and so on, could affect network conditions and contain helpful information for this throughput prediction. In this article, we consider how our knowledge about the session features can be utilized to improve ABR quality via customized parameter settings. We present our ABR-independent, QoE-driven, feature-based partition method to classify the logged video sessions so that different parameter settings could be adopted in different situations to reach better quality. A variation of Decision Tree is developed for the classification and has been applied to a sample ABR for evaluation. 
The experiment shows that our approach can improve the average bitrate of the sample ABR by 36.1\% without causing the increase of the rebuffering ratio where 99\% of the sessions can get improvement. It can also improve the rebuffering ratio by 87.7\% without causing the decrease of the average bitrate, where, among those sessions involved in rebuffering, 82\% receives improvement and 18\% remains the same.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1191", } @Article{Milo:2020:GRD, author = "Tova Milo", title = "Getting Rid of Data", journal = j-JDIQ, volume = "12", number = "1", pages = "1:1--1:7", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3326920", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3326920", abstract = "We are experiencing an amazing data-centered revolution. Incredible amounts of data are collected, integrated, and analyzed, leading to key breakthroughs in science and society. This well of knowledge, however, is at a great risk if we do not dispense \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Firmani:2020:EDD, author = "Donatella Firmani and Letizia Tanca and Riccardo Torlone", title = "Ethical Dimensions for Data Quality", journal = j-JDIQ, volume = "12", number = "1", pages = "2:1--2:5", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3362121", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3362121", acknowledgement = ack-nhfb, ajournal = "J. 
Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Draisbach:2020:TPD, author = "Uwe Draisbach and Peter Christen and Felix Naumann", title = "Transforming Pairwise Duplicates to Entity Clusters for High-quality Duplicate Detection", journal = j-JDIQ, volume = "12", number = "1", pages = "3:1--3:30", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3352591", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3352591", abstract = "Duplicate detection algorithms produce clusters of database records, each cluster representing a single real-world entity. As most of these algorithms use pairwise comparisons, the resulting (transitive) clusters can be inconsistent: Not all records \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Shakeel:2020:ASQ, author = "Yusra Shakeel and Jacob Kr{\"u}ger and Ivonne von Nostitz-Wallwitz and Gunter Saake and Thomas Leich", title = "Automated Selection and Quality Assessment of Primary Studies: a Systematic Literature Review", journal = j-JDIQ, volume = "12", number = "1", pages = "4:1--4:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3356901", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3356901", abstract = "Researchers use systematic literature reviews (SLRs) to synthesize existing evidence regarding a research topic. While being an important means to condense knowledge, conducting an SLR requires a large amount of time and effort.
Consequently, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Siagian:2020:RWC, author = "Al Hafiz Akbar Maulana Siagian and Masayoshi Aritsugi", title = "Robustness of Word and Character {$N$}-gram Combinations in Detecting Deceptive and Truthful Opinions", journal = j-JDIQ, volume = "12", number = "1", pages = "5:1--5:24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3349536", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3349536", abstract = "Opinions in reviews about the quality of products or services can be important information for readers. Unfortunately, such opinions may include deceptive ones posted for some business reasons. To keep the opinions as a valuable and trusted source of \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Aswani:2020:EMM, author = "Reema Aswani and Arpan Kumar Kar and P. Vigneswara Ilavarasan", title = "Experience: Managing Misinformation in Social Media --- Insights for Policymakers from {Twitter} Analytics", journal = j-JDIQ, volume = "12", number = "1", pages = "6:1--6:18", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3341107", ISSN = "1936-1955", bibdate = "Thu Jan 23 07:39:46 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3341107", abstract = "Governance of misinformation is a serious concern in social media platforms. Based on experiences gathered from different case studies, we offer insights for the policymakers on managing misinformation in social media.
These platforms are widely used \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Rula:2020:ESI, author = "Anisa Rula and Amrapali Zaveri and Elena Simperl and Elena Demidova", title = "Editorial: Special Issue on Quality Assessment of Knowledge Graphs Dedicated to the Memory of {Amrapali Zaveri}", journal = j-JDIQ, volume = "12", number = "2", pages = "7:1--7:4", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388748", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue May 19 09:08:07 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3388748", abstract = "This editorial summarizes the content of the Special Issue on Quality Assessment of Knowledge Graphs of the Journal of Data and Information Quality (JDIQ). We dedicate this special issue to the memory of our colleague and friend Amrapali Zaveri.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Ahmadi:2020:MER, author = "Naser Ahmadi and Viet-Phi Huynh and Vamsi Meduri and Stefano Ortona and Paolo Papotti", title = "Mining Expressive Rules in Knowledge Graphs", journal = j-JDIQ, volume = "12", number = "2", pages = "8:1--8:27", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371315", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue May 19 09:08:07 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371315", abstract = "We describe RuDiK, an algorithm and a system for mining declarative rules over RDF knowledge graphs (KGs). 
RuDiK can discover rules expressing both positive relationships between KG elements, e.g., ``if two persons share at least one parent, they are \ldots{}''.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Haller:2020:WLL, author = "Armin Haller and Javier D. Fern{\'a}ndez and Maulik R. Kamdar and Axel Polleres", title = "What Are Links in Linked Open Data? {A} Characterization and Evaluation of Links between Knowledge Graphs on the {Web}", journal = j-JDIQ, volume = "12", number = "2", pages = "9:1--9:34", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369875", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue May 19 09:08:07 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369875", abstract = "Linked Open Data promises to provide guiding principles to publish interlinked knowledge graphs on the Web in the form of findable, accessible, interoperable, and reusable datasets. We argue that while as such, Linked Data may be viewed as a basis for \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Mountantonakis:2020:CBU, author = "Michalis Mountantonakis and Yannis Tzitzikas", title = "Content-based Union and Complement Metrics for Dataset Search over {RDF} Knowledge Graphs", journal = j-JDIQ, volume = "12", number = "2", pages = "10:1--10:31", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372750", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Apr 27 07:10:38 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372750", abstract = "RDF Knowledge Graphs (or Datasets) contain valuable information that can be exploited for a variety of real-world tasks. However, due to the enormous size of the available RDF datasets, it is difficult to discover the most valuable datasets for a given \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Bertossi:2020:DQE, author = "Leopoldo Bertossi and Floris Geerts", title = "Data Quality and Explainable {AI}", journal = j-JDIQ, volume = "12", number = "2", pages = "11:1--11:9", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3386687", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue May 19 09:08:07 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3386687", abstract = "In this work, we provide some insights and develop some ideas, with few technical details, about the role of explanations in Data Quality in the context of data-based machine learning models (ML). In this direction, there are, as expected, roles for \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Pitoura:2020:SMM, author = "Evaggelia Pitoura", title = "Social-minded Measures of Data Quality: Fairness, Diversity, and Lack of Bias", journal = j-JDIQ, volume = "12", number = "3", pages = "12:1--12:8", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3404193", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 30 07:16:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3404193", abstract = "For decades, research in data-driven algorithmic systems has focused on improving efficiency (making data access faster and lighter) and effectiveness (providing relevant results to users). As data-driven decision making becomes prevalent, there is an \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Colborne:2020:CDR, author = "Adrienne Colborne and Michael Smit", title = "Characterizing Disinformation Risk to Open Data in the Post-Truth Era", journal = j-JDIQ, volume = "12", number = "3", pages = "13:1--13:13", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3328747", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 30 07:16:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3328747", abstract = "Curated, labeled, high-quality data is a valuable commodity for tasks such as business analytics and machine learning. Open data is a common source of such data---for example, retail analytics draws on open demographic data, and weather forecast systems \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Blay:2020:IRF, author = "Karen Banahene Blay and Steven Yeomans and Peter Demian and Danny Murguia", title = "The Information Resilience Framework: Vulnerabilities, Capabilities, and Requirements", journal = j-JDIQ, volume = "12", number = "3", pages = "14:1--14:25", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388786", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 30 07:16:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3388786", abstract = "The quality of information is crucial to the success of asset delivery, management, and performance in the Digitised Architecture, Engineering, Construction, and Operations (DAECO) sector. The exposure and sensitivity of information to threats during \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Koumarelas:2020:DPD, author = "Ioannis Koumarelas and Lan Jiang and Felix Naumann", title = "Data Preparation for Duplicate Detection", journal = j-JDIQ, volume = "12", number = "3", pages = "15:1--15:24", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377878", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 30 07:16:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377878", abstract = "Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Visengeriyeva:2020:AMD, author = "Larysa Visengeriyeva and Ziawasch Abedjan", title = "Anatomy of Metadata for Data Curation", journal = j-JDIQ, volume = "12", number = "3", pages = "16:1--16:30", month = jul, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371925", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 30 07:16:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371925", abstract = "Real-world datasets often suffer from various data quality problems. Several data cleaning solutions have been proposed so far. However, data cleaning remains a manual and iterative task that requires domain and technical expertise. Exploiting metadata \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Polese:2020:ESI, author = "Giuseppe Polese and Vincenzo Deufemia and Shaoxu Song", title = "Editorial: Special Issue on Metadata Discovery for Assessing Data Quality", journal = j-JDIQ, volume = "12", number = "4", pages = "17:1--17:2", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3423321", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3423321", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Beneventano:2020:BET, author = "Domenico Beneventano and Sonia Bergamaschi and Luca Gagliardelli and Giovanni Simonini", title = "{BLAST2}: an Efficient Technique for Loose Schema Information Extraction from Heterogeneous Big Data Sources", journal = j-JDIQ, volume = "12", number = "4", pages = "18:1--18:22", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3394957", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3394957", abstract = "We present BLAST2, a novel technique to efficiently extract loose schema information, i.e., metadata that can serve as a surrogate of the schema alignment task within the Entity Resolution (ER) process, to identify records that refer to the same real-\ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Caruccio:2020:IDI, author = "Loredana Caruccio and Stefano Cirillo", title = "Incremental Discovery of Imprecise Functional Dependencies", journal = j-JDIQ, volume = "12", number = "4", pages = "19:1--19:25", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3397462", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3397462", abstract = "Functional dependencies (fds) are one of the metadata used to assess data quality and to perform data cleaning operations. 
However, to pursue robustness with respect to data errors, it has been necessary to devise imprecise versions of functional \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Maiolo:2020:DPP, author = "Sof{\'\i}a Maiolo and Lorena Etcheverry and Adriana Marotta", title = "Data Profiling in Property Graph Databases", journal = j-JDIQ, volume = "12", number = "4", pages = "20:1--20:27", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409473", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3409473", abstract = "Property Graph databases are being increasingly used within the industry as a powerful and flexible way to model real-world scenarios. With this flexibility, a great challenge appears regarding profiling tasks due to the need of adapting them to these \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Ahmadi:2020:RPC, author = "Naser Ahmadi and Thi-Thuy-Duyen Truong and Le-Hong-Mai Dao and Stefano Ortona and Paolo Papotti", title = "{RuleHub}: a Public Corpus of Rules for Knowledge Graphs", journal = j-JDIQ, volume = "12", number = "4", pages = "21:1--21:22", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409384", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3409384", abstract = "Entity-centric knowledge graphs (KGs) are now popular to collect facts about entities. 
KGs have rich schemas with a large number of different types and predicates to describe the entities and their relationships. On these rich schemas, logical rules are \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Lammel:2020:MHQ, author = "Philipp L{\"a}mmel and Benjamin Dittwald and Lina Bruns and Nikolay Tcholtchev and Yuri Glikman and Silke Cuno and Mathias Fl{\"u}gge and Ina Schieferdecker", title = "Metadata Harvesting and Quality Assurance within Open Urban Platforms", journal = j-JDIQ, volume = "12", number = "4", pages = "22:1--22:20", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409795", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3409795", abstract = "During the past years, various activities and concepts have shaped and prepared the path for the development of urban environments toward smart cities across the world. One of the initial activities was relating to the opening of vast amounts of data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Li:2021:DEM, author = "Yuliang Li and Jinfeng Li and Yoshihiko Suhara and Jin Wang and Wataru Hirota and Wang-Chiew Tan", title = "Deep Entity Matching: Challenges and Opportunities", journal = j-JDIQ, volume = "13", number = "1", pages = "1:1--1:17", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3431816", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Feb 10 10:35:23 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3431816", abstract = "Entity matching refers to the task of determining whether two different representations refer to the same real-world entity. It continues to be a prevalent problem for many organizations where data resides in different sources and duplicates the need to \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Loster:2021:KTE, author = "Michael Loster and Ioannis Koumarelas and Felix Naumann", title = "Knowledge Transfer for Entity Resolution with {Siamese} Neural Networks", journal = j-JDIQ, volume = "13", number = "1", pages = "2:1--2:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3410157", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Feb 10 10:35:23 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3410157", abstract = "The integration of multiple data sources is a common problem in a large variety of applications. Traditionally, handcrafted similarity measures are used to discover, merge, and integrate multiple representations of the same entity---duplicates---into a \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Neto:2021:DGD, author = "Nelson Novaes Neto and Stuart Madnick and Anchises Moraes G. {De Paula} and Natasha Malara Borges", title = "Developing a Global Data Breach Database and the Challenges Encountered", journal = j-JDIQ, volume = "13", number = "1", pages = "3:1--3:33", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3439873", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Feb 10 10:35:23 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3439873", abstract = "If the mantra ``data is the new oil'' of our digital economy is correct, then data leak incidents are the critical disasters in the online society. The initial goal of our research was to present a comprehensive database of data breaches of personal \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Fazzolari:2021:EIO, author = "Michela Fazzolari and Francesco Buccafurri and Gianluca Lax and Marinella Petrocchi", title = "Experience: Improving Opinion Spam Detection by Cumulative Relative Frequency Distribution", journal = j-JDIQ, volume = "13", number = "1", pages = "4:1--4:16", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3439307", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Feb 10 10:35:23 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3439307", abstract = "Over the past few years, online reviews have become very important, since they can influence the purchase decision of consumers and the reputation of businesses. 
Therefore, the practice of writing fake reviews can have severe consequences on customers \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Costa:2021:EQA, author = "Rog{\'e}rio Lu{\'\i}s C. Costa and Enrico Miranda and Paulo Dias and Jos{\'e} Moreira", title = "Experience: Quality Assessment and Improvement on a Forest Fire Dataset", journal = j-JDIQ, volume = "13", number = "1", pages = "5:1--5:13", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3428155", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Feb 10 10:35:23 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3428155", abstract = "Spatio-temporal data can be used to study and simulate the movement and behavior of objects and natural phenomena. However, the use of real-world data raises several challenges related to its acquisition, representation, and quality. This article \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Aljawarneh:2021:ESIa, author = "Shadi Aljawarneh and Juan A. Lara", title = "Editorial: Special Issue on Quality Assessment and Management in Big Data --- {Part I}", journal = j-JDIQ, volume = "13", number = "2", pages = "6:1--6:3", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3449052", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3449052", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Cummings:2021:SCM, author = "Mary L. Cummings and Songpo Li", title = "Subjectivity in the Creation of Machine Learning Models", journal = j-JDIQ, volume = "13", number = "2", pages = "7:1--7:19", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418034", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3418034", abstract = "Transportation analysts are inundated with requests to apply popular machine learning modeling techniques to datasets to uncover never-before-seen relationships that could potentially revolutionize safety, congestion, and mobility. However, the results \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Shah:2021:GBD, author = "Syed Iftikhar Hussain Shah and Vassilios Peristeras and Ioannis Magnisalis", title = "Government Big Data Ecosystem: Definitions, Types of Data, Actors, and Roles and the Impact in Public Administrations", journal = j-JDIQ, volume = "13", number = "2", pages = "8:1--8:25", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3425709", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3425709", abstract = "The public sector, private firms, business community, and civil society are generating data that are high in volume, veracity, and velocity and come from a diversity of sources. This type of data is today known as big data. Public administrations pursue \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Batayneh:2021:SSI, author = "Abeer A. {Al Batayneh} and Malik Qasaimeh and Raad S. Al-Qassas", title = "A Scoring System for Information Security Governance Framework Using Deep Learning Algorithms: a Case Study on the Banking Sector", journal = j-JDIQ, volume = "13", number = "2", pages = "9:1--9:34", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418172", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3418172", abstract = "Cybercrime reports showed an increase in the number of attacks targeting financial institutions. Indeed, banks were the target of 30\% of the total number of cyber-attacks. One of the recommended methods for driving the security challenges is to implement \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Fraihat:2021:BIF, author = "Salam Fraihat and Walid A. Salameh and Ammar Elhassan and Bushra Abu Tahoun and Maisa Asasfeh", title = "Business Intelligence Framework Design and Implementation: a Real-estate Market Case Study", journal = j-JDIQ, volume = "13", number = "2", pages = "10:1--10:16", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3422669", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3422669", abstract = "This article builds on previous work in the area of real-world applications of Business Intelligence (BI) technology. 
It illustrates the analysis, modeling, and framework design of a BI solution with high data quality to provide reliable analytics and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Khalemsky:2021:EDV, author = "A. Khalemsky and R. Gelbard", title = "{ExpanDrogram}: Dynamic Visualization of Big Data Segmentation over Time", journal = j-JDIQ, volume = "13", number = "2", pages = "11:1--11:27", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3434778", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3434778", abstract = "In dynamic and big data environments the visualization of a segmentation process over time often does not enable the user to simultaneously track entire pieces. The key points are sometimes incomparable, and the user is limited to a static visual \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Radhakrishna:2021:CPV, author = "Vangipuram Radhakrishna and Gali Suresh Reddy and Puligadda Veereswara Kumar and Vinjamuri Janaki", title = "Challenge Paper: The Vision for Time Profiled Temporal Association Mining", journal = j-JDIQ, volume = "13", number = "2", pages = "12:1--12:8", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3404198", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jul 1 08:31:27 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3404198", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Aljawarneh:2021:ESIb, author = "Shadi Aljawarneh and Juan A. Lara", title = "Editorial: Special Issue on Quality Assessment and Management in Big Data --- {Part II}", journal = j-JDIQ, volume = "13", number = "3", pages = "13:1--13:3", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3449056", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3449056", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{J:2021:HIM, author = "Sreelakshmy I. J. and Binsu C. Kovoor", title = "A Hybrid Inpainting Model Combining Diffusion and Enhanced Exemplar Methods", journal = j-JDIQ, volume = "13", number = "3", pages = "14:1--14:19", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418035", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3418035", abstract = "Image inpainting is a technique in the world of image editing where missing portions of the image are estimated and filled with the help of available or external information. In the proposed model, a novel hybrid inpainting algorithm is implemented, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chirkova:2021:EDR, author = "Rada Chirkova and Jon Doyle and Juan Reutter", title = "Ensuring Data Readiness for Quality Requirements with Help from Procedure Reuse", journal = j-JDIQ, volume = "13", number = "3", pages = "15:1--15:15", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3428154", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3428154", abstract = "Assessing and improving the quality of data are fundamental challenges in Big-Data applications. These challenges have given rise to numerous solutions targeting transformation, integration, and cleaning of data. However, while schema design, data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Joy:2021:OBL, author = "Jeevamol Joy and Nisha S. Raj and Renumol V. G.", title = "Ontology-based E-learning Content Recommender System for Addressing the Pure Cold-start Problem", journal = j-JDIQ, volume = "13", number = "3", pages = "16:1--16:27", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429251", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3429251", abstract = "E-learning recommender systems are gaining significance nowadays due to its ability to enhance the learning experience by providing tailor-made services based on learner preferences. A Personalized Learning Environment (PLE) that automatically adapts to \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Roy:2021:UNA, author = "Anurag Roy and Shalmoli Ghosh and Kripabandhu Ghosh and Saptarshi Ghosh", title = "An Unsupervised Normalization Algorithm for Noisy Text: a Case Study for Information Retrieval and Stance Detection", journal = j-JDIQ, volume = "13", number = "3", pages = "17:1--17:25", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3418036", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3418036", abstract = "A large fraction of textual data available today contains various types of ``noise,'' such as OCR noise in digitized documents, noise due to informal writing style of users on microblogging sites, and so on. To enable tasks such as search/retrieval and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Liu:2021:EAC, author = "Zhicheng Liu and Yang Zhang and Ruihong Huang and Zhiwei Chen and Shaoxu Song and Jianmin Wang", title = "{EXPERIENCE}: Algorithms and Case Study for Explaining Repairs with Uniform Profiles over {IoT} Data", journal = j-JDIQ, volume = "13", number = "3", pages = "18:1--18:17", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436239", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Aug 2 15:58:12 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3436239", abstract = "IoT data with timestamps are often found with outliers, such as GPS trajectories or sensor readings. 
While existing systems mostly focus on detecting temporal outliers without explanations and repairs, a decision maker may be more interested in the cause \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Kubiczek:2021:CRC, author = "Jakub Kubiczek and Bart{\l}omiej Hadasik", title = "Challenges in Reporting the {COVID-19} Spread and its Presentation to the Society", journal = j-JDIQ, volume = "13", number = "4", pages = "19:1--19:7", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3470851", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3470851", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Tufis:2021:TCD, author = "Mihnea Tufis and Ludovico Boratto", title = "Toward a Complete Data Valuation Process. Challenges of Personal Data", journal = j-JDIQ, volume = "13", number = "4", pages = "20:1--20:7", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447269", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3447269", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Nayak:2021:EAP, author = "Stuti Nayak and Amrapali Zaveri and Pedro Hernandez Serrano and Michel Dumontier", title = "Experience: Automated Prediction of Experimental Metadata from Scientific Publications", journal = j-JDIQ, volume = "13", number = "4", pages = "21:1--21:11", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451219", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3451219", abstract = "While there exists an abundance of open biomedical data, the lack of high-quality metadata makes it challenging for others to find relevant datasets and to reuse them for another purpose. In particular, metadata are useful to understand the nature and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chen:2021:BBA, author = "Jessica Chen and Henry Milner and Ion Stoica and Jibin Zhan", title = "Benchmark of Bitrate Adaptation in Video Streaming", journal = j-JDIQ, volume = "13", number = "4", pages = "22:1--22:24", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468063", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3468063", abstract = "The HTTP adaptive streaming technique opened the door to cope with the fluctuating network conditions during the streaming process by dynamically adjusting the volume of the future chunks to be downloaded. The bitrate selection in this adjustment \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Amaral:2021:AQS, author = "Gabriel Amaral and Alessandro Piscopo and Lucie-Aim{\'e}e Kaffee and Odinaldo Rodrigues and Elena Simperl", title = "Assessing the Quality of Sources in {Wikidata} Across Languages: a Hybrid Approach", journal = j-JDIQ, volume = "13", number = "4", pages = "23:1--23:35", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3484828", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Nov 3 09:43:30 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3484828", abstract = "Wikidata is one of the most important sources of structured data on the web, built by a worldwide community of volunteers. As a secondary source, its contents must be backed by credible references; this is particularly important, as Wikidata explicitly \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "23", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Barhamgi:2022:ESIa, author = "Mahmoud Barhamgi and Elisa Bertino", title = "Editorial: Special Issue on Data Transparency --- Data Quality, Annotation, and Provenance", journal = j-JDIQ, volume = "14", number = "1", pages = "1:1--1:3", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3494454", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Feb 3 06:14:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3494454", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Thirumuruganathan:2022:AAA, author = "Saravanan Thirumuruganathan and Mayuresh Kunjir and Mourad Ouzzani and Sanjay Chawla", title = "Automated Annotations for {AI} Data and Model Transparency", journal = j-JDIQ, volume = "14", number = "1", pages = "2:1--2:9", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460000", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Feb 3 06:14:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460000", abstract = "The data and Artificial Intelligence revolution has had a massive impact on enterprises, governments, and society alike. It is fueled by two key factors. First, data have become increasingly abundant and are often available openly. Enterprises have more \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Geisler:2022:KDD, author = "Sandra Geisler and Maria-Esther Vidal and Cinzia Cappiello and Bernadette Farias L{\'o}scio and Avigdor Gal and Matthias Jarke and Maurizio Lenzerini and Paolo Missier and Boris Otto and Elda Paja and Barbara Pernici and Jakob Rehof", title = "Knowledge-Driven Data Ecosystems Toward Data Transparency", journal = j-JDIQ, volume = "14", number = "1", pages = "3:1--3:12", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3467022", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Feb 3 06:14:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3467022", abstract = "A data ecosystem (DE) offers a keystone-player or alliance-driven infrastructure that enables the interaction of different stakeholders and the resolution of interoperability issues among shared data. However, despite years of research in data governance \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Belhajjame:2022:AWP, author = "Khalid Belhajjame", title = "On the Anonymization of Workflow Provenance without Compromising the Transparency of Lineage", journal = j-JDIQ, volume = "14", number = "1", pages = "4:1--4:27", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460207", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Feb 3 06:14:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460207", abstract = "Workflows have been adopted in several scientific fields as a tool for the specification and execution of scientific experiments. 
In addition to automating the execution of experiments, workflow systems often include capabilities to record provenance \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Dargahi:2022:IBC, author = "Tooska Dargahi and Hossein Ahmadvand and Mansour Naser Alraja and Chia-Mu Yu", title = "Integration of Blockchain with Connected and Autonomous Vehicles: Vision and Challenge", journal = j-JDIQ, volume = "14", number = "1", pages = "5:1--5:10", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460003", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Feb 3 06:14:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460003", abstract = "Connected and Autonomous Vehicles (CAVs) are introduced to improve individuals' quality of life by offering a wide range of services. They collect a huge amount of data and exchange them with each other and the infrastructure. The collected data usually includes sensitive information about the users and the surrounding environment. Therefore, data security and privacy are among the main challenges in this industry. Blockchain, an emerging distributed ledger, has been considered by the research community as a potential solution for enhancing data security, integrity, and transparency in Intelligent Transportation Systems (ITS). However, despite the emphasis of governments on the transparency of personal data protection practices, CAV stakeholders have not been successful in communicating appropriate information with the end users regarding the procedure of collecting, storing, and processing their personal data, as well as the data ownership. 
This article provides a vision of the opportunities and challenges of adopting blockchain in ITS from the ``data transparency'' and ``privacy'' perspective. The main aim is to answer the following questions: (1) Considering the amount of personal data collected by the CAVs, such as location, how would the integration of blockchain technology affect transparency, fairness, and lawfulness of personal data processing concerning the data subjects (as this is one of the main principles in the existing data protection regulations)? (2) How can the trade-off between transparency and privacy be addressed in blockchain-based ITS use cases?", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Barhamgi:2022:ESIb, author = "Mahmoud Barhamgi and Elisa Bertino", title = "Editorial: Special Issue on Data Transparency --- Uses Cases and Applications", journal = j-JDIQ, volume = "14", number = "2", pages = "6:1--6:3", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3494455", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3494455", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Badr:2022:DTF, author = "Youakim Badr and Rahul Sharma", title = "Data Transparency and Fairness Analysis of the {NYPD Stop-and-Frisk Program}", journal = j-JDIQ, volume = "14", number = "2", pages = "7:1--7:14", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460533", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460533", abstract = "Given the increased concern of racial disparities in the stop-and-frisk programs, the New York Police Department (NYPD) requires publicly displaying detailed data for all the stops conducted by police authorities, including the suspected offense and race \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chen:2022:ATR, author = "Chien-Lun Chen and Leana Golubchik and Ranjan Pal", title = "Achieving Transparency Report Privacy in Linear Time", journal = j-JDIQ, volume = "14", number = "2", pages = "8:1--8:56", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460001", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460001", abstract = "An accountable algorithmic transparency report (ATR) should ideally investigate (a) transparency of the underlying algorithm, and (b) fairness of the algorithmic decisions, and at the same time preserve data subjects' privacy. However, a provably formal \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Mauri:2022:EDM, author = "Lara Mauri and Ernesto Damiani", title = "Estimating Degradation of Machine Learning Data Assets", journal = j-JDIQ, volume = "14", number = "2", pages = "9:1--9:15", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3446331", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3446331", abstract = "Large-scale adoption of Artificial Intelligence and Machine Learning (AI-ML) models fed by heterogeneous, possibly untrustworthy data sources has spurred interest in estimating degradation of such models due to spurious, adversarial, or low-quality data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wang:2022:TAL, author = "Bin Wang and Pengfei Guo and Xing Wang and Yongzhong He and Wei Wang", title = "Transparent Aspect-Level Sentiment Analysis Based on Dependency Syntax Analysis and Its Application on {COVID-19}", journal = j-JDIQ, volume = "14", number = "2", pages = "10:1--10:24", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3460002", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3460002", abstract = "Aspect-level sentiment analysis identifies fine-grained emotion for target words. There are three major issues in current models of aspect-level sentiment analysis. First, few models consider the natural language semantic characteristics of the texts. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Hsu:2022:EAM, author = "Che-Yun Hsu and Ting-Rui Chen and Hung-Hsuan Chen", title = "Experience: Analyzing Missing {Web} Page Visits and Unintentional {Web} Page Visits from the Client-side {Web} Logs", journal = j-JDIQ, volume = "14", number = "2", pages = "11:1--11:17", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3490392", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3490392", abstract = "Web logs have been widely used to represent the web page visits of online users. However, we found that web logs in Chrome's browsing history only record 57\% of users' visited websites, i.e., nearly half of a user's website visits are not recorded. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Patnaik:2022:WIE, author = "Sudhir Kumar Patnaik and C. Narendra Babu", title = "A {Web} Information Extraction Framework with Adaptive and Failure Prediction Feature", journal = j-JDIQ, volume = "14", number = "2", pages = "12:1--12:21", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3495008", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Apr 23 13:23:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3495008", abstract = "The amount of information available on the internet today requires effective information extraction and processing to offer hyper-personalized user experiences. 
Inability to extract information by using traditional and machine learning techniques due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Ilyas:2022:MLD, author = "Ihab F. Ilyas and Theodoros Rekatsinas", title = "Machine Learning and Data Cleaning: Which Serves the Other?", journal = j-JDIQ, volume = "14", number = "3", pages = "13:1--13:11", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506712", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3506712", abstract = "The last few years witnessed significant advances in building automated or semi-automated data quality, data cleaning and data integration systems powered by machine learning (ML). In parallel, large deployment of ML systems in business, science, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Santoro:2022:ESI, author = "Donatello Santoro and Saravanan Thirumuruganathan and Paolo Papotti", title = "Editorial: Special Issue on Deep Learning for Data Quality", journal = j-JDIQ, volume = "14", number = "3", pages = "14:1--14:3", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3513135", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3513135", abstract = "This editorial summarizes the content of the Special Issue on Deep Learning for Data Quality of the Journal of Data and Information Quality (JDIQ).", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wu:2022:CTL, author = "Renzhi Wu and Nilaksh Das and Sanya Chaba and Sakshi Gandhi and Duen Horng Chau and Xu Chu", title = "A Cluster-then-label Approach for Few-shot Learning with Application to Automatic Image Data Labeling", journal = j-JDIQ, volume = "14", number = "3", pages = "15:1--15:23", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491232", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3491232", abstract = "Few-shot learning (FSL) aims at learning to generalize from only a small number of labeled examples for a given target task. Most current state-of-the-art FSL methods typically have two limitations. First, they usually require access to a source dataset \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Shraga:2022:PQA, author = "Roee Shraga and Avigdor Gal", title = "{PoWareMatch}: a Quality-aware Deep Learning Approach to Improve Human Schema Matching", journal = j-JDIQ, volume = "14", number = "3", pages = "16:1--16:27", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3483423", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3483423", abstract = "Schema matching is a core task of any data integration process. 
Being investigated in the fields of databases, AI, Semantic Web, and data mining for many years, the main challenge remains the ability to generate quality matches among data concepts (e.g., \ldots{}).", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Haque:2022:NIC, author = "Md Enamul Haque and Mehmet Engin Tozal", title = "Negative Insurance Claim Generation Using Distance Pooling on Positive Diagnosis-Procedure Bipartite Graphs", journal = j-JDIQ, volume = "14", number = "3", pages = "17:1--17:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3531347", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3531347", abstract = "Negative samples in health and medical insurance domain refer to fraudulent or erroneous insurance claims that may include inconsistent diagnosis-procedure relations with respect to a medical coding system. Unfortunately, only a few datasets are publicly \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Demetrescu:2022:WCC, author = "Camil Demetrescu and Irene Finocchi and Andrea Ribichini and Marco Schaerf", title = "Which Conference Is That? 
{A} Case Study in Computer Science", journal = j-JDIQ, volume = "14", number = "3", pages = "18:1--18:13", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3519031", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3519031", abstract = "Conferences play a major role in some disciplines such as computer science and are often used in research quality evaluation exercises. Differently from journals and books, for which ISSN and ISBN codes provide unambiguous keys, recognizing the conference \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Gram:2022:DIH, author = "Dennis Gram and Pantelis Karapanagiotis and Marius Liebald and Uwe Walz", title = "Design and Implementation of a Historical {German} Firm-level Financial Database", journal = j-JDIQ, volume = "14", number = "3", pages = "19:1--19:22", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3531533", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3531533", abstract = "Broad, long-term financial, and economic datasets are scarce resources, particularly in the European context. In this article, we present an approach for an extensible data model that is adaptable to future changes in technologies and sources. This model \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Zheng:2022:CDC, author = "Zheng Zheng and Longtao Zheng and Morteza Alipourlangouri and Fei Chiang and Lukasz Golab and Jaroslaw Szlichta and Sridevi Baskaran", title = "Contextual Data Cleaning with Ontology Functional Dependencies", journal = j-JDIQ, volume = "14", number = "3", pages = "20:1--20:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3524303", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3524303", abstract = "Functional Dependencies define attribute relationships based on syntactic equality, and when used in data cleaning, they erroneously label syntactically different but semantically equivalent values as errors. We explore dependency-based data cleaning with \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Hacker:2022:ACC, author = "Philipp Hacker and Felix Naumann and Tobias Friedrich and Stefan Grundmann and Anja Lehmann and Herbert Zech", title = "{AI} Compliance --- Challenges of Bridging Data Science and Law", journal = j-JDIQ, volume = "14", number = "3", pages = "21:1--21:4", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3531532", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Aug 10 06:32:51 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3531532", abstract = "This vision article outlines the main building blocks of what we term AI Compliance, an effort to bridge two complementary research areas: computer science and the law. 
Such research has the goal to model, measure, and affect the quality of AI artifacts, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Li:2022:DCC, author = "Yuanxia Li and Faiz Currim and Sudha Ram", title = "Data Completeness and Complex Semantics in Conceptual Modeling: The Need for a Disaggregation Construct", journal = j-JDIQ, volume = "14", number = "4", pages = "22:1--22:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3532784", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3532784", abstract = "Conceptual modeling is important for developing databases that maintain the integrity and quality of stored information. However, classical conceptual models have often been assumed to work on well-maintained and high-quality data. With the advancement \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Johnson:2022:SCB, author = "Justin M. Johnson and Taghi M. Khoshgoftaar", title = "A Survey on Classifying Big Data with Label Noise", journal = j-JDIQ, volume = "14", number = "4", pages = "23:1--23:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3492546", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3492546", abstract = "Class label noise is a critical component of data quality that directly inhibits the predictive performance of machine learning algorithms. 
While many data-level and algorithm-level methods exist for treating label noise, the challenges associated with \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "23", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Firmani:2022:ESI, author = "Donatella Firmani and Letizia Tanca and Riccardo Torlone", title = "Editorial: Special Issue on Data Quality and Ethics", journal = j-JDIQ, volume = "14", number = "4", pages = "24:1--24:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3561202", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3561202", abstract = "This editorial summarizes the content of the Special Issue on Data Quality and Ethics of the Journal of Data and Information Quality (JDIQ). The issue accepted submissions from June 1 to July 30, 2021.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "24", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Mecati:2022:DRB, author = "Mariachiara Mecati and Antonio Vetr{\`o} and Marco Torchiano", title = "Detecting Risk of Biased Output with Balance Measures", journal = j-JDIQ, volume = "14", number = "4", pages = "25:1--25:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3530787", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3530787", abstract = "Data have become a fundamental element of the management and productive infrastructures of our society, fuelling digitization of organizational and decision-making processes at an impressive speed. 
This transition shows lights and shadows, and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "25", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Accinelli:2022:CBA, author = "Chiara Accinelli and Barbara Catania and Giovanna Guerrini and Simone Minisi", title = "A Coverage-based Approach to Nondiscrimination-aware Data Transformation", journal = j-JDIQ, volume = "14", number = "4", pages = "26:1--26:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546913", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546913", abstract = "The development of technological solutions satisfying nondiscriminatory requirements is one of the main current challenges for data processing. Back-end operators for preparing, i.e., extracting and transforming, data play a relevant role w.r.t. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "26", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Jagadish:2022:MFD, author = "H. Jagadish and Julia Stoyanovich and Bill Howe", title = "The Many Facets of Data Equity", journal = j-JDIQ, volume = "14", number = "4", pages = "27:1--27:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3533425", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3533425", abstract = "Data-driven systems can induce, operationalize, and amplify systemic discrimination in a variety of ways. As data scientists, we tend to prefer to isolate and formalize equity problems to make them amenable to narrow technical solutions. 
However, this \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "27", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Mazilu:2022:FAD, author = "Lacramioara Mazilu and Norman W. Paton and Nikolaos Konstantinou and Alvaro A. A. Fernandes", title = "Fairness-aware Data Integration", journal = j-JDIQ, volume = "14", number = "4", pages = "28:1--28:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3519419", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3519419", abstract = "Machine learning can be applied in applications that take decisions that impact people's lives. Such techniques have the potential to make decision making more objective, but there also is a risk that the decisions can discriminate against certain groups \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "28", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Azzalini:2022:FDF, author = "Fabio Azzalini and Chiara Criscuolo and Letizia Tanca", title = "{E-FAIR-DB}: Functional Dependencies to Discover Data Bias and Enhance Data Equity", journal = j-JDIQ, volume = "14", number = "4", pages = "29:1--29:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3552433", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:10 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3552433", abstract = "Decisions based on algorithms and systems generated from data have become essential tools that pervade all aspects of our daily lives; for these advances to be reliable, the results should be accurate but should also respect all the facets of data equity \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "29", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wright:2023:ISI, author = "Dustin Wright and Paolo Papotti and Isabelle Augenstein", title = "Introduction to the Special Issue on Truth and Trust Online", journal = j-JDIQ, volume = "15", number = "1", pages = "1:1--1:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3578242", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3578242", abstract = "This editorial summarizes the content of the Special Issue on Truth and Trust Online of the Journal of Data and Information Quality. We thank the authors for their exceptional contributions to this special issue.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Gausen:2023:UAB, author = "Anna Gausen and Wayne Luk and Ce Guo", title = "Using Agent-Based Modelling to Evaluate the Impact of Algorithmic Curation on Social Media", journal = j-JDIQ, volume = "15", number = "1", pages = "2:1--2:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546915", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546915", abstract = "Social media networks have drastically changed how people communicate and seek information. Due to the scale of information on these platforms, newsfeed curation algorithms have been developed to sort through this information and curate what users see. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Stammbach:2023:CTK, author = "Dominik Stammbach and Boya Zhang and Elliott Ash", title = "The Choice of Textual Knowledge Base in Automated Claim Checking", journal = j-JDIQ, volume = "15", number = "1", pages = "3:1--3:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561389", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3561389", abstract = "Automated claim checking is the task of determining the veracity of a claim given evidence retrieved from a textual knowledge base of trustworthy facts. While previous work has taken the knowledge base as given and optimized the claim-checking pipeline, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Brand:2023:NMJ, author = "Erik Brand and Kevin Roitero and Michael Soprano and Afshin Rahimi and Gianluca Demartini", title = "A Neural Model to Jointly Predict and Explain Truthfulness of Statements", journal = j-JDIQ, volume = "15", number = "1", pages = "4:1--4:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546917", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546917", abstract = "Automated fact-checking (AFC) systems exist to combat disinformation, however, their complexity usually makes them opaque to the end-user, making it difficult to foster trust in the system. In this article, we introduce the E-BART model with the hope of \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Qu:2023:CHM, author = "Yunke Qu and Kevin Roitero and David {La Barbera} and Damiano Spina and Stefano Mizzaro and Gianluca Demartini", title = "Combining Human and Machine Confidence in Truthfulness Assessment", journal = j-JDIQ, volume = "15", number = "1", pages = "5:1--5:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546916", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546916", abstract = "Automatically detecting online misinformation at scale is a challenging and interdisciplinary problem. Deciding what is to be considered truthful information is sometimes controversial and also difficult for educated experts. 
As the scale of the problem \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Anuchitanukul:2023:RCT, author = "Atijit Anuchitanukul and Julia Ive and Lucia Specia", title = "Revisiting Contextual Toxicity Detection in Conversations", journal = j-JDIQ, volume = "15", number = "1", pages = "6:1--6:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561390", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3561390", abstract = "Understanding toxicity in user conversations is undoubtedly an important problem. Addressing ``covert'' or implicit cases of toxicity is particularly hard and requires context. Very few previous studies have analysed the influence of conversational context \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Panda:2023:DDW, author = "Subhadarshi Panda and Sarah Levitan", title = "Deception Detection Within and Across Domains: Identifying and Understanding the Performance Gap", journal = j-JDIQ, volume = "15", number = "1", pages = "7:1--7:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561413", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3561413", abstract = "NLP approaches to automatic deception detection have gained popularity over the past few years, especially with the proliferation of fake reviews and fake news online. However, most previous studies of deception detection have focused on single domains. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Senaratne:2023:UIA, author = "Asara Senaratne and Peter Christen and Graham Williams and Pouya G. Omran", title = "Unsupervised Identification of Abnormal Nodes and Edges in Graphs", journal = j-JDIQ, volume = "15", number = "1", pages = "8:1--8:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546912", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546912", abstract = "Much of today's data are represented as graphs, ranging from social networks to bibliographic citations. Nodes in such graphs correspond to records that generally represent entities, while edges represent relationships between these entities. Both nodes \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Zuo:2023:SSP, author = "Chaoyuan Zuo and Ritwik Banerjee and Fateme Hashemi Chaleshtori and Hossein Shirazi and Indrakshi Ray", title = "Seeing Should Probably Not Be Believing: The Role of Deceptive Support in {COVID-19} Misinformation on {Twitter}", journal = j-JDIQ, volume = "15", number = "1", pages = "9:1--9:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3546914", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Mar 9 08:17:11 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3546914", abstract = "With the spread of the SARS-CoV-2, enormous amounts of information about the pandemic are disseminated through social media platforms such as Twitter. 
Social media posts often leverage the trust readers have in prestigious news agencies and cite news \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Navigli:2023:BLL, author = "Roberto Navigli and Simone Conia and Bj{\"o}rn Ross", title = "Biases in Large Language Models: Origins, Inventory, and Discussion", journal = j-JDIQ, volume = "15", number = "2", pages = "10:1--10:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597307", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597307", abstract = "In this article, we introduce and discuss the pervasive issue of bias in the large language models that are currently at the core of mainstream approaches to Natural Language Processing (NLP). We first introduce data selection bias, that is, the bias \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Priestley:2023:SDQ, author = "Maria Priestley and Fionnt{\'a}n O'Donnell and Elena Simperl", title = "A Survey of Data Quality Requirements That Matter in {ML} Development Pipelines", journal = j-JDIQ, volume = "15", number = "2", pages = "11:1--11:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3592616", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3592616", abstract = "The fitness of the systems in which Machine Learning (ML) is used depends greatly on good-quality data.
Specifications on what makes a good-quality dataset have traditionally been defined by the needs of the data users --- typically analysts and engineers. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Simon:2023:CCA, author = "Eric Simon and Bernd Amann and Rutian Liu and St{\'e}phane Gan{\c{c}}arski", title = "Controlling the Correctness of Aggregation Operations During Sessions of Interactive Analytic Queries", journal = j-JDIQ, volume = "15", number = "2", pages = "12:1--12:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575812", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3575812", abstract = "We present a comprehensive set of conditions and rules to control the correctness of aggregation queries within an interactive data analysis session. The goal is to extend self-service data preparation and Business Intelligence (BI) tools to automatically \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Skavantzos:2023:UCO, author = "Philipp Skavantzos and Uwe Leck and Kaiqi Zhao and Sebastian Link", title = "Uniqueness Constraints for Object Stores", journal = j-JDIQ, volume = "15", number = "2", pages = "13:1--13:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3581758", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3581758", abstract = "Object stores offer an increasingly popular choice for data management and analytics.
As with every data model, managing the integrity of objects is fundamental for data quality but also important for the efficiency of update and query operations. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Smith:2023:LSA, author = "Duncan Smith and Mark Elliot and Joseph W. Sakshaug", title = "To Link or Synthesize? {An} Approach to Data Quality Comparison", journal = j-JDIQ, volume = "15", number = "2", pages = "14:1--14:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3580487", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3580487", abstract = "Linking administrative data to produce more informative data for subsequent analysis has become an increasingly common practice. However, there might be concomitant risks of disclosing sensitive information about individuals. One practice that reduces \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Ao:2023:TPR, author = "Jing Ao and Zehui Cheng and Rada Chirkova and Phokion G. 
Kolaitis", title = "Theory and Practice of Relational-to-{RDF} Temporal Data Exchange and Query Answering", journal = j-JDIQ, volume = "15", number = "2", pages = "15:1--15:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3591359", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3591359", abstract = "We consider the problem of answering temporal queries on RDF stores, in presence of atemporal RDFS domain ontologies, of relational data sources that include temporal information, and of rules that map the domain information in the source schemas into the \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Timko:2023:IMD, author = "Christina Timko and Malte Niederstadt and Naman Goel and Boi Faltings", title = "Incentive Mechanism Design for Responsible Data Governance: a Large-scale Field Experiment", journal = j-JDIQ, volume = "15", number = "2", pages = "16:1--16:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3592617", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3592617", abstract = "A crucial building block of responsible artificial intelligence is responsible data governance, including data collection. Its importance is also underlined in the latest EU regulations. The data should be of high quality, foremost correct and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Simard:2023:MCD, author = "Vanessa Simard and Mikael R{\"o}nnqvist and Luc Lebel and Nadia Lehoux", title = "A Method to Classify Data Quality for Decision Making Under Uncertainty", journal = j-JDIQ, volume = "15", number = "2", pages = "17:1--17:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3592534", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3592534", abstract = "Every decision-making process is subject to a certain degree of uncertainty. In sectors where the outcomes of the operations planned are uncertain and difficult to control such as in forestry, data describing the available resources can have a large \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Tawakuli:2023:EDB, author = "Amal Tawakuli and Daniel Kaiser and Thomas Engel", title = "Experience: Differentiating Between Isolated and Sequence Missing Data", journal = j-JDIQ, volume = "15", number = "2", pages = "18:1--18:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575809", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3575809", abstract = "Missing data is one of the most persistent problems found in data that hinders information and value extraction. Handling missing data is a preprocessing task that has been extensively studied by the research community and remains an active research topic \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Srivastava:2023:ESI, author = "Gautam Srivastava and Jerry Chun-Wei Lin and Zhihan Lv", title = "Editorial for the Special Issue on Quality Assessment of Data Security", journal = j-JDIQ, volume = "15", number = "2", pages = "19:1--19:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3591360", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3591360", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Hoffpauir:2023:SEI, author = "Kyle Hoffpauir and Jacob Simmons and Nikolas Schmidt and Rachitha Pittala and Isaac Briggs and Shanmukha Makani and Yaser Jararweh", title = "A Survey on Edge Intelligence and Lightweight Machine Learning Support for Future Applications and Services", journal = j-JDIQ, volume = "15", number = "2", pages = "20:1--20:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3581759", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3581759", abstract = "As the number of devices connected to the Internet has grown larger, so too has the intensity of the tasks that these devices need to perform. Modern networks are more frequently working to perform computationally intensive tasks on low-power devices and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Singh:2023:IEC, author = "Kedar Nath Singh and Amit Kumar Singh", title = "An Improved Encryption-Compression-based Algorithm for Securing Digital Images", journal = j-JDIQ, volume = "15", number = "2", pages = "21:1--21:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3532783", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3532783", abstract = "Nowadays, there is an increasing tendency to upload images to online platforms acting as information carriers for various applications. Unfortunately, the unauthorized utilization of such images is a serious concern that has significantly impacted \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Supriya:2023:SSC, author = "Y. Supriya and Thippa Reddy Gadekallu", title = "A Survey on Soft Computing Techniques for Federated Learning --- Applications, Challenges and Future Directions", journal = j-JDIQ, volume = "15", number = "2", pages = "22:1--22:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575810", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3575810", abstract = "Federated Learning is a distributed, privacy-preserving machine learning model that is gaining more attention these days. Federated Learning has a vast number of applications in different fields. 
While being more popular, it also suffers some drawbacks \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chatterjee:2023:MRS, author = "Kakali Chatterjee and Ashish Singh and Neha and Keping Yu", title = "A Multifactor Ring Signature based Authentication Scheme for Quality Assessment of {IoMT} Environment in {COVID-19} Scenario", journal = j-JDIQ, volume = "15", number = "2", pages = "23:1--23:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575811", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3575811", abstract = "The quality of the healthcare environment has become an essential factor for healthcare users to access quality services. Smart healthcare systems use the Internet of Medical Things (IoMT) devices to capture patients' health data for treatment or \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "23", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Kumar:2023:EEC, author = "Gautam Kumar and Sambit Bakshi and Arun Kumar Sangaiah and Pankaj Kumar Sa", title = "Experimental Evaluation of Covariates Effects on Periocular Biometrics: a Robust Security Assessment Framework", journal = j-JDIQ, volume = "15", number = "2", pages = "24:1--24:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579029", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Jul 1 13:31:36 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3579029", abstract = "The growing integration of technology into our lives has resulted in unprecedented amounts of data that are being exchanged among devices in an Internet of Things (IoT) environment. Authentication, identification, and device heterogeneities are major \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "24", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Fadlallah:2023:CAB, author = "Hadi Fadlallah and Rima Kilany and Houssein Dhayne and Rami {El Haddad} and Rafiqul Haque and Yehia Taher and Ali Jaber", title = "Context-aware Big Data Quality Assessment: a Scoping Review", journal = j-JDIQ, volume = "15", number = "3", pages = "25:1--25:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603707", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603707", abstract = "The term data quality refers to measuring the fitness of data regarding the intended usage. 
Poor data quality leads to inadequate, inconsistent, and erroneous decisions that could escalate the computational cost, cause a decline in profits, and cause \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "25", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Irrera:2023:NCS, author = "Ornella Irrera and Andrea Mannocci and Paolo Manghi and Gianmaria Silvello", title = "A Novel Curated Scholarly Graph Connecting Textual and Data Publications", journal = j-JDIQ, volume = "15", number = "3", pages = "26:1--26:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597310", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597310", abstract = "In the last decade, scholarly graphs became fundamental to storing and managing scholarly knowledge in a structured and machine-readable way. Methods and tools for discovery and impact assessment of science rely on such graphs and their quality to serve \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "26", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Fadlallah:2023:BDB, author = "Hadi Fadlallah and Rima Kilany and Houssein Dhayne and Rami {El Haddad} and Rafiqul Haque and Yehia Taher and Ali Jaber", title = "{BIGQA}: Declarative Big Data Quality Assessment", journal = j-JDIQ, volume = "15", number = "3", pages = "27:1--27:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603706", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603706", abstract = "In the big data domain, data quality assessment operations are often complex and must be implementable in a distributed and timely manner. This article tries to generalize the quality assessment operations by providing a new ISO-based declarative data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "27", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wenz:2023:CHD, author = "Viola Wenz and Arno Kesper and Gabriele Taentzer", title = "Clustering Heterogeneous Data Values for Data Quality Analysis", journal = j-JDIQ, volume = "15", number = "3", pages = "28:1--28:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603710", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603710", abstract = "Data is of high quality if it is fit for its intended purpose. Data heterogeneity can be a major quality problem, as quality aspects such as understandability and consistency can be compromised. Heterogeneity of data values is particularly common when \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "28", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Hofstede:2023:PDQ, author = "Arthur H. M. ter Hofstede and Agnes Koschmider and Andrea Marrella and Robert Andrews and Dominik A. Fischer and Sareh Sadeghianasl and Moe Thandar Wynn and Marco Comuzzi and Jochen {De Weerdt} and Kanika Goel and Niels Martin and Pnina Soffer", title = "Process-Data Quality: The True Frontier of Process Mining", journal = j-JDIQ, volume = "15", number = "3", pages = "29:1--29:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3613247", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3613247", abstract = "Since its emergence over two decades ago, process mining has flourished as a discipline, with numerous contributions to its theory, widespread practical applications, and mature support by commercial tooling environments. However, its potential for \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "29", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chakraborty:2023:EMM, author = "Chinmay Chakraborty and Mohammad Khosravi and Muhammad Khurram Khan and Houbing Herbert Song", title = "Editorial: Multimodality, Multidimensional Representation, and Multimedia Quality Assessment Toward Information Quality in Social {Web} of Things", journal = j-JDIQ, volume = "15", number = "3", pages = "30:1--30:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3625102", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3625102", abstract = "This editorial summarizes the content of the collection on Multimodality, Multidimensional Representation, and Multimedia Quality Assessment Toward Information Quality in Social Web of Things for the Journal of Data and Information Quality.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "30", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Aume:2023:MSD, author = "Cameron Aume and Shantanu Pal and Alireza Jolfaei and Subhas Mukhopadhyay", title = "Multimodal Social Data Analytics on the Design and Implementation of an {EEG}-Mechatronic System Interface", journal = j-JDIQ, volume = "15", number = "3", pages = "31:1--31:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597306", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597306", abstract = "The devices that can read Electroencephalography (EEG) signals have been widely used for Brain-Computer Interfaces (BCIs). 
Popularity in the field of BCIs has increased in recent years with the development of several consumer-grade EEG devices that can \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "31", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Jing:2023:SCT, author = "Yang Jing and Ma Haowei and Arshiya S. Ansari and G. Sucharitha and Batyrkhan Omarov and Sandeep Kumar and Mohammad Sajid Mohammadi and Khaled A. Z. Alyamani", title = "Soft Computing Techniques for Detecting Cyberbullying in Social Multimedia Data", journal = j-JDIQ, volume = "15", number = "3", pages = "32:1--32:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3604617", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3604617", abstract = "Cyberbullying is a form of abuse, manipulation, or humiliation directed against a single person via the Internet. CB makes use of nasty Internet comments and remarks. It occurs when someone publicly mocks, insults, slanders, criticizes, or mocks another \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "32", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Matrouk:2023:DLB, author = "Khaled Matrouk and {Srikanth V} and Sumit Kumar and Mohit Kumar Bhadla and Mirza Sabirov and Mohamed J. 
Saadh", title = "Deep Learning-based Dynamic User Alignment in Social Networks", journal = j-JDIQ, volume = "15", number = "3", pages = "33:1--33:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603711", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603711", abstract = "Academics and businesses are paying intense attention to social network alignment, which centers various social networks around their shared members. All studies to date treat the social network as static and ignore its innate dynamism. In reality, an \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "33", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Martin:2023:FBR, author = "R. John Martin and Rajvardhan Oak and Mukesh Soni and V. Mahalakshmi and Arsalan Muhammad Soomar and Anjali Joshi", title = "Fusion-based Representation Learning Model for Multimode User-generated Social Network Content", journal = j-JDIQ, volume = "15", number = "3", pages = "34:1--34:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603712", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603712", abstract = "As mobile networks and APPs are developed, user-generated content (UGC), which includes multi-source heterogeneous data like user reviews, tags, scores, images, and videos, has become an essential basis for improving the quality of personalized services. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "34", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Attar:2023:JIM, author = "Hani Attar", title = "Joint {IoT\slash ML} Platforms for Smart Societies and Environments: a Review on Multimodal Information-Based Learning for Safety and Security", journal = j-JDIQ, volume = "15", number = "3", pages = "35:1--35:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603713", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603713", abstract = "The application of the Internet of Things (IoT) is highly expected to have comprehensive economic, business, and societal implications for our smart lives; indeed, IoT technologies play an essential role in creating a variety of smart applications that \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "35", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Al-Qerem:2023:TSC, author = "Ahmad Al-Qerem and Ali Mohd Ali and Shadi Nashwan and Mohammad Alauthman and Ala Hamarsheh and Ahmad Nabot and Issam Jibreen", title = "Transactional Services for Concurrent Mobile Agents over Edge\slash Cloud Computing-Assisted Social {Internet of Things}", journal = j-JDIQ, volume = "15", number = "3", pages = "36:1--36:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603714", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603714", abstract = "The Web of Things (WoT) is a concept that aims to create a network of intelligent devices capable of remote monitoring, service provisioning, and control. 
Virtual and Physical Internet of Things (IoT) gateways facilitate communication, processing, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "36", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Al-Qerem:2023:SGM, author = "Ahmad Al-Qerem and Ali Mohd Ali and Hani Attar and Shadi Nashwan and Lianyong Qi and Mohammad Kazem Moghimi and Ahmed Solyman", title = "Synthetic Generation of Multidimensional Data to Improve Classification Model Validity", journal = j-JDIQ, volume = "15", number = "3", pages = "37:1--37:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603715", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603715", abstract = "This article aims to compare Generative Adversarial Network (GAN) models and feature selection methods for generating synthetic data in order to improve the validity of a classification model. The synthetic data generation technique involves generating \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "37", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Alzubi:2023:MDL, author = "Ahmad Alzu'bi and Lojin Bani Younis and Abdelrahman Abuarqoub and Mohammad Hammoudeh", title = "Multimodal Deep Learning with Discriminant Descriptors for Offensive Memes Detection", journal = j-JDIQ, volume = "15", number = "3", pages = "38:1--38:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597308", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597308", abstract = "A meme is a visual representation that illustrates a thought or concept. Memes are spreading steadily among people in this era of rapidly expanding social media platforms, and they are becoming increasingly popular forms of expression. In the domain of \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "38", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Varedi:2023:NFS, author = "Erfan Varedi and Reza Boostani", title = "A Novel Feature Selection Method for Risk Management in High-Dimensional Time Series of Cryptocurrency Market", journal = j-JDIQ, volume = "15", number = "3", pages = "39:1--39:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597309", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Oct 2 15:49:58 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597309", abstract = "In this study, a novel approach for feature selection has been presented in order to overcome the challenge of classifying positive and negative risk prediction in the cryptocurrency market, which contains high fluctuation. 
This approach is based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "39", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Console:2023:ESI, author = "Marco Console and Maurizio Lenzerini", title = "Editorial: Special Issue on Quality Aspects of Data Preparation", journal = j-JDIQ, volume = "15", number = "4", pages = "40:1--40:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3626461", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3626461", abstract = "This Special Issue of the Journal of Data and Information Quality (JDIQ) contains novel theoretical and methodological contributions as well as state-of-the-art reviews and research perspectives on quality aspects of data preparation. In this editorial, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "40", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Lambrix:2023:CDO, author = "Patrick Lambrix", title = "Completing and Debugging Ontologies: State-of-the-art and Challenges in Repairing Ontologies", journal = j-JDIQ, volume = "15", number = "4", pages = "41:1--41:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597304", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597304", abstract = "As semantically enabled applications require high-quality ontologies, developing and maintaining ontologies that are as correct and complete as possible is an important although difficult task in ontology engineering. 
A key task is ontology debugging and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "41", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Bono:2023:PDD, author = "Carlo A. Bono and Cinzia Cappiello and Barbara Pernici and Edoardo Ramalli and Monica Vitali", title = "Pipeline Design for Data Preparation for Social Media Analysis", journal = j-JDIQ, volume = "15", number = "4", pages = "42:1--42:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597305", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3597305", abstract = "In a data-driven culture, in which analytics applications are the main resources for supporting decision-making, the use of high-quality datasets is mandatory to minimize errors and risks. For this reason, data analysis tasks need to be preceded by a data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "42", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Krasikov:2023:MSA, author = "Pavel Krasikov and Christine Legner", title = "A Method to Screen, Assess, and Prepare Open Data for Use", journal = j-JDIQ, volume = "15", number = "4", pages = "43:1--43:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603708", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603708", abstract = "Open data's value-creating capabilities and innovation potential are widely recognized, resulting in a notable increase in the number of published open data sources.
A crucial challenge for companies intending to leverage open data is to identify suitable \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "43", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Patel:2023:DCA, author = "Hima Patel and Shanmukha Guttula and Nitin Gupta and Sandeep Hans and Ruhi Sharma Mittal and Lokesh N.", title = "A Data-centric {AI} Framework for Automating Exploratory Data Analysis and Data Quality Tasks", journal = j-JDIQ, volume = "15", number = "4", pages = "44:1--44:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603709", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3603709", abstract = "Democratisation of machine learning (ML) has been an important theme in the research community for the last several years with notable progress made by the model-building community with automated machine learning models. However, data play a central role \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "44", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Vasto-Terrientes:2023:EDM, author = "Luis {Del Vasto-Terrientes}", title = "Experience: Data Management for Delivering {COVID-19} Relief in {Panama}", journal = j-JDIQ, volume = "15", number = "4", pages = "45:1--45:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3623511", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Dec 23 05:24:09 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3623511", abstract = "A data-driven public sector recognizes data as a key element for implementing policies based on evidence. 
The open data movement has been a major catalyst for elevating data to a privileged position in many governments around the globe. In Panama, open \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "45", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Naumann:2024:E, author = "Felix Naumann", title = "Editorial", journal = j-JDIQ, volume = "16", number = "1", pages = "1:1--1:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650728", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3650728", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Catarci:2024:ECJ, author = "Tiziana Catarci", title = "{Editor}-in-{Chief} ({June 2017--November 2023}) Farewell Report", journal = j-JDIQ, volume = "16", number = "1", pages = "2:1--2:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3651229", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3651229", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Demartini:2024:ESI, author = "Gianluca Demartini and Shazia Sadiq and Jie Yang", title = "Editorial: Special Issue on Human in the Loop Data Curation", journal = j-JDIQ, volume = "16", number = "1", pages = "3:1--3:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650209", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3650209", abstract = "This Special Issue of the Journal of Data and Information Quality (JDIQ) contains novel theoretical and methodological contributions on data curation involving humans in the loop. In this editorial, we summarize the scope of the issue and briefly describe \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Tsaneva:2024:EHL, author = "Stefani Tsaneva and Marta Sabou", title = "Enhancing Human-in-the-Loop Ontology Curation Results through Task Design", journal = j-JDIQ, volume = "16", number = "1", pages = "4:1--4:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3626960", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3626960", abstract = "The success of artificial intelligence (AI) applications is heavily dependent on the quality of data they rely on. Thus, data curation, dealing with cleaning, organising, and managing data, has become a significant research area to be addressed. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Breuer:2024:VSU, author = "Timo Breuer and Norbert Fuhr and Philipp Schaer", title = "Validating Synthetic Usage Data in Living Lab Environments", journal = j-JDIQ, volume = "16", number = "1", pages = "5:1--5:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3623640", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3623640", abstract = "Evaluating retrieval performance without editorial relevance judgments is challenging, but instead, user interactions can be used as relevance signals. Living labs offer a way for small-scale platforms to validate information retrieval systems with real \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Pereira:2024:CSU, author = "Jo{\~a}o L. M. Pereira and Manuel J. Fonseca and Ant{\'o}nia Lopes and Helena Galhardas", title = "{Cleenex}: Support for User Involvement during an Iterative Data Cleaning Process", journal = j-JDIQ, volume = "16", number = "1", pages = "6:1--6:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3648476", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3648476", abstract = "The existence of large amounts of data increases the probability of occurring data quality problems. A data cleaning process that corrects these problems is usually an iterative process, because it may need to be re-executed and refined to produce high- \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Deunf:2024:DQA, author = "Julian {Le Deunf} and Arwa Khannoussi and Laurent Lecornu and Patrick Meyer and John Puentes", title = "Data Quality Assessment through a Preference Model", journal = j-JDIQ, volume = "16", number = "1", pages = "7:1--7:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632407", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3632407", abstract = "Evaluating the quality of data is a problem of a multi-dimensional nature and quite frequently depends on the perspective of an expected use or final purpose of the data. Numerous works have explored the well-known specification of data quality dimensions \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Geeganage:2024:TEG, author = "Dakshi Tharanga Kapugama Geeganage and Moe Thandar Wynn and Arthur H. M. ter Hofstede", title = "{Text2EL+}: Expert Guided Event Log Enrichment Using Unstructured Text", journal = j-JDIQ, volume = "16", number = "1", pages = "8:1--8:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640018", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3640018", abstract = "Through the application of process mining, business processes can be improved on the basis of process execution data captured in event logs. Naturally, the quality of this data determines the quality of the improvement recommendations. 
Improving data \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Backes:2024:CCS, author = "Tobias Backes and Stefan Dietze", title = "Connected Components for Scaling Partial-order Blocking to Billion Entities", journal = j-JDIQ, volume = "16", number = "1", pages = "9:1--9:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3646553", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3646553", abstract = "In entity resolution, blocking pre-partitions data for further processing by more expensive methods. Two entity mentions are in the same block if they share identical or related blocking-keys. Previous work has sometimes related blocking keys by grouping \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Richard:2024:AEA, author = "Guy-Junior Richard and J{\'e}r{\^o}me Habonneau and Didier Gu{\'e}riot and Jean-Marc {Le Caillec}", title = "{AI} Explainability and Acceptance: a Case Study for Underwater Mine Hunting", journal = j-JDIQ, volume = "16", number = "1", pages = "10:1--10:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3635113", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Mon Mar 25 11:29:07 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3635113", abstract = "In critical operational context such as Mine Warfare, Automatic Target Recognition (ATR) algorithms are still hardly accepted. 
The complexity of their decision-making hampers understanding of predictions despite performances approaching human expert ones. \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Li:2024:ALD, author = "Na Li and Yiyang Qi and Chaoran Li and Zhiming Zhao", title = "Active Learning for Data Quality Control: a Survey", journal = j-JDIQ, volume = "16", number = "2", pages = "11:1--11:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663369", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jun 27 06:15:46 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3663369", abstract = "Data quality plays a vital role in scientific research and decision-making across industries. Thus, it is crucial to incorporate the data quality control (DQC) process, which comprises various actions and operations to detect and correct data errors. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Mecca:2024:BLR, author = "Giansalvatore Mecca and Paolo Papotti and Donatello Santoro and Enzo Veltri", title = "{BUNNI}: Learning Repair Actions in Rule-driven Data Cleaning", journal = j-JDIQ, volume = "16", number = "2", pages = "12:1--12:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3665930", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jun 27 06:15:46 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3665930", abstract = "In this work, we address the challenging and open problem of involving non-expert users in the data repairing problem as first-class citizens. 
Despite a large number of proposals that have been devoted to cleaning data from the point of view of expert \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Bachinger:2024:DVU, author = "Florian Bachinger and Lisa Ehrlinger and Gabriel Kronberger and Wolfram W{\"o}ss", title = "Data Validation Utilizing Expert Knowledge and Shape Constraints", journal = j-JDIQ, volume = "16", number = "2", pages = "13:1--13:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3661826", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jun 27 06:15:46 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3661826", abstract = "Data validation is a primary concern in any data-driven application, as undetected data errors may negatively affect machine learning models and lead to suboptimal decisions. Data quality issues are usually detected manually by experts, which becomes \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Stenger:2024:TCS, author = "Michael Stenger and Andr{\'e} Bauer and Thomas Prantl and Robert Leppich and Nathaniel Hudson and Kyle Chard and Ian Foster and Samuel Kounev", title = "Thinking in Categories: a Survey on Assessing the Quality for Time Series Synthesis", journal = j-JDIQ, volume = "16", number = "2", pages = "14:1--14:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3666006", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Jun 27 06:15:46 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3666006", abstract = "Time series data are widely used and provide a wealth of information for countless applications. However, some applications are faced with a limited amount of data, or the data cannot be used due to confidentiality concerns. To overcome these obstacles, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Chuprov:2024:DQB, author = "Sergei Chuprov and Raman Zatsarenko and Leon Reznik and Igor Khokhlov", title = "Data Quality Based Intelligent Instrument Selection with Security Integration", journal = j-JDIQ, volume = "16", number = "3", pages = "15:1--15:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695770", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Oct 10 06:13:03 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3695770", abstract = "We propose a novel Data Quality with Security (DQS) integrated instrumentation selection approach that facilitates aggregation of multi-modal data from heterogeneous sources. 
As our major contribution, we develop a framework that incorporates multiple \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Belgacem:2024:AAD, author = "Hichem Belgacem and Xiaochen Li and Domenico Bianculli and Lionel Briand", title = "Automated Anomaly Detection for Categorical Data by Repurposing a Form Filling Recommender System", journal = j-JDIQ, volume = "16", number = "3", pages = "16:1--16:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3696110", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Oct 10 06:13:03 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3696110", abstract = "Data quality is crucial in modern software systems, like data-driven decision support systems. However, data quality is affected by data anomalies, which represent instances that deviate from most of the data. These anomalies affect the reliability and \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Peters:2024:GEM, author = "Heinrich Peters and Alireza Hashemi and James Rae", title = "Generalizable Error Modeling for Human Data Annotation: Evidence From an Industry-Scale Search Data Annotation Program", journal = j-JDIQ, volume = "16", number = "3", pages = "17:1--17:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688394", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Oct 10 06:13:03 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3688394", abstract = "Machine learning (ML) and artificial intelligence (AI) systems rely heavily on human-annotated data for training and evaluation. A major challenge in this context is the occurrence of annotation errors, as their effects can degrade model performance. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Alzahrani:2024:ECA, author = "Naif Alzahrani and Jacek Ca{\l}a and Paolo Missier", title = "Experience: a Comparative Analysis of Multivariate Time-Series Generative Models: a Case Study on Human Activity Data", journal = j-JDIQ, volume = "16", number = "3", pages = "18:1--18:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688393", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Oct 10 06:13:03 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3688393", abstract = "Human activity recognition (HAR) is an active research field that has seen great success in recent years due to advances in sensory data collection methods and activity recognition systems. 
Deep artificial intelligence (AI) models have contributed to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Serra:2024:UCD, author = "Flavia Serra and Ver{\'o}nika Peralta and Adriana Marotta and Patrick Marcel", title = "Use of Context in Data Quality Management: a Systematic Literature Review", journal = j-JDIQ, volume = "16", number = "3", pages = "19:1--19:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672082", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Thu Oct 10 06:13:03 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3672082", abstract = "The importance of context in data quality (DQ) was shown many years ago and nowadays is widely accepted. Early approaches and surveys defined DQ as fitness for use and showed the influence of context on DQ. This article presents a Systematic Literature \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Khomh:2024:ESI, author = "Foutse Khomh and Andreas Metzger and Phu Nguyen and Sagar Sen", title = "Editorial: Special Issue on Software Engineering and {AI} for Data Quality", journal = j-JDIQ, volume = "16", number = "4", pages = "20:1--20:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3708503", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3708503", abstract = "This editorial summarizes the content of the Special Issue on Software Engineering and AI for Data Quality of the Journal of Data and Information Quality (JDIQ).", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "20", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Golendukhina:2024:CCI, author = "Valentina Golendukhina and Harald Foidl and Daniel H{\"o}rl and Michael Felderer", title = "A Catalog of Consumer {IoT} Device Characteristics for Data Quality Estimation", journal = j-JDIQ, volume = "16", number = "4", pages = "21:1--21:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3639708", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3639708", abstract = "The Internet of Things (IoT) is rapidly growing and spreading across different markets, including the customer market and consumer IoT (CIoT). The large variety of gadgets and their availability makes CIoT more and more influential, especially in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "21", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Begoli:2024:CDP, author = "Edmon Begoli and Maria Mahbub and Linsey Passarella and Sudarshan Srinivasan", title = "A Compound Data Poisoning Technique with Significant Adversarial Effects on Transformer-based Sentiment Classification Tasks", journal = j-JDIQ, volume = "16", number = "4", pages = "22:1--22:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3705897", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3705897", abstract = "Transformer-based models have demonstrated much success in various natural language processing tasks. However, they are often vulnerable to adversarial attacks, such as data poisoning, which can intentionally fool the model into generating incorrect \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "22", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Valeriano:2024:UPM, author = "Maria Gabriela Valeriano and Ana Matran-Fernandez and Carlos Kiffer and Ana Carolina Lorena", title = "Understanding the Performance of Machine Learning Models from Data- to Patient-level", journal = j-JDIQ, volume = "16", number = "4", pages = "23:1--23:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3687267", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3687267", abstract = "Machine Learning (ML) models have the potential to support decision-making in healthcare by grasping complex patterns within data.
However, decisions in this domain are sensitive and require active involvement of domain specialists with deep knowledge of \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "23", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Jesus:2024:UAE, author = "Rui Filipe Ribeiro Jesus and Ana Rodrigues and Carlos Costa", title = "Unlocking {AutoML}: Enhancing Data with Deep Learning Algorithms for Medical Imaging", journal = j-JDIQ, volume = "16", number = "4", pages = "24:1--24:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3705896", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3705896", abstract = "Deep learning algorithms have become increasingly popular over the years, having proved their efficiency in input-output functions for distinct types of data. This technology is particularly useful in medical imaging, where complex image structures often \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "24", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Truong:2024:TPR, author = "Hong-Linh Truong and Ngoc Nhu Trang Nguyen", title = "{TENSAI} --- Practical and Responsible Observability for Data Quality-aware Large-scale Analytics", journal = j-JDIQ, volume = "16", number = "4", pages = "25:1--25:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3708014", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Tue Dec 24 06:42:37 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3708014", abstract = "Given a large-scale mobile network with a variety of equipment and radio access network technologies for an approximate 20 million subscribers, there are many types of data that can be used for big data analytics and machine learning (ML) tasks for \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "25", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Hamed:2025:COD, author = "Naeima Hamed and Omer Rana and Pablo Orozco-terWengel and Beno{\^\i}t Goossens and Charith Perera", title = "A Comparison of Open Data Observatories", journal = j-JDIQ, volume = "17", number = "1", pages = "1:1--1:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705863", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Mar 29 08:13:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3705863", abstract = "Open Data Observatories refer to online platforms that provide real-time and historical data for a particular application context, e.g., urban/non-urban environments or a specific application domain. They are generally developed to facilitate \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "1", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Boeckling:2025:CDS, author = "Toon Boeckling and Antoon Bronselaer", title = "Cleaning Data with {Swipe}", journal = j-JDIQ, volume = "17", number = "1", pages = "2:1--2:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712205", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Mar 29 08:13:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3712205", abstract = "The repair problem for functional dependencies is the problem where an input database needs to be modified such that all functional dependencies are satisfied and the difference with the original database is minimal. The output database is then called a minimal-cost repair. If the allowed modifications are value updates, then finding a minimal-cost repair is NP-hard. A well-known approach to find approximations of minimal-cost repairs builds a Chase tree in which each internal node resolves violations of one functional dependency and leaf nodes represent repairs. A key property of this approach is that controlling the branching factor of the Chase tree allows to control the tradeoff between repair quality and computational efficiency. In this article, we explore an extreme variant of this idea in which the Chase tree has only one path. To construct this path, we first create an ordered partition of attributes (i.e., a partition of which the classes are totally ordered) such that classes can be repaired sequentially. We repair each class only once and do so by fixing the order in which dependencies are repaired. This principle is called priority repairing, and we provide a simple heuristic to determine priority. The techniques for attribute partitioning and priority repair are combined in an algorithm called Swipe.
An empirical study on four real-life datasets shows that Swipe is one to three orders of magnitude faster than Llunatic and HoloClean, whereas the quality of repairs is comparable or better. A scalability analysis shows that Swipe scales linearly for an increasing number of tuples and quadratically for an increasing number of FDs.", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "2", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Videsjorden:2025:DEP, author = "Adela Nedisan Videsjorden and Arda Goknil and Sagar Sen and Erik Johannes Husom and Phu Nguyen", title = "{3D-DaVa}: Enhancing {3D} Point Cloud Data Reliability for Industrial Applications", journal = j-JDIQ, volume = "17", number = "1", pages = "3:1--3:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711817", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Mar 29 08:13:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3711817", abstract = "The escalating incorporation of three-dimensional (3D) point cloud data across industrial applications highlights the necessity of assuring its reliability. The error-prone process of object digitization, the large data volumes, and equipment inaccuracies \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "3", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Choudhury:2025:QAF, author = "Bismita Choudhury and En-Tni Lin and Jacqueline Speir", title = "A Quantitative Approach for Forensic Footwear Quality Assessment using Machine and Deep Learning", journal = j-JDIQ, volume = "17", number = "1", pages = "4:1--4:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716634", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Mar 29 08:13:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3716634", abstract = "Forensic footwear impressions play a crucial role in criminal investigations, assisting in possible suspect identification. The quality of an impression collected from a crime scene directly impacts the forensic information that can be garnered from any \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "4", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Nanayakkara:2025:UEE, author = "Charini Nanayakkara and Peter Christen and Victor Christen", title = "Unsupervised Evaluation of Entity Resolution", journal = j-JDIQ, volume = "17", number = "1", pages = "5:1--5:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721985", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Sat Mar 29 08:13:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", URL = "https://dl.acm.org/doi/10.1145/3721985", abstract = "Entity resolution is the problem of identifying records that refer to the same entity from one or multiple databases. Applications of entity resolution range from health and social science research to national security and online commerce. Entity \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "5", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Comuzzi:2025:LMS, author = "Marco Comuzzi and Jonghyeon Ko and Fabrizio Maggi", title = "A Language to Model and Simulate Data Quality Issues in Process Mining", journal = j-JDIQ, volume = "17", number = "2", pages = "6:1--6:36", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743144", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Real-life business process event logs may suffer from significant data quality problems negatively influencing process mining analysis. Over time, a range of approaches has been developed to detect and repair these quality problems. Validation of these \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "6", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Christen:2025:GMD, author = "Victor Christen and Daniel Obraczka and Marvin Hofer and Martin Franke and Erhard Rahm", title = "Graph Metrics-driven Record Cluster Repair Meets {LLM}-based Active Learning", journal = j-JDIQ, volume = "17", number = "2", pages = "7:1--7:25", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3735511", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Entity resolution plays an important role in data integration. However, most entity resolution methods focus on pairwise linkage and ignore potential errors generated by the transitive closure based on the determined equality links between two or more \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "7", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Klier:2025:ABA, author = "Mathias Klier and Andreas Obermeier and Christian Sparn and Torben Widmann", title = "Anomaly-based Assessment of Semantic Consistency: Design and Evaluation of a Novel Probability-based Metric in Cooperation with a {German} Car Manufacturer", journal = j-JDIQ, volume = "17", number = "2", pages = "8:1--8:24", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3732783", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "In the era of big data, the rate at which data is produced and analyzed is increasing steadily. However, the occurrence of conflicting data can significantly deteriorate the quality of the analyses and diminish the potential benefits of big data. Thus, \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "8", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Rondina:2025:EBD, author = "Marco Rondina and Antonio Vetr{\`o} and Alessandro Fabris and Gianmaria Silvello and Gian Antonio Susto and Marco Torchiano and Juan Carlos {De Martin}", title = "Experience: Bridging Data Measurement and Ethical Challenges with Extended Data Briefs", journal = j-JDIQ, volume = "17", number = "2", pages = "9:1--9:22", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3726872", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "To promote the responsible development and use of data-driven technologies---such as machine learning and artificial intelligence---principles of trustworthiness, accountability, and fairness should be followed.
The quality of the dataset on which these \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "9", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Ebiele:2025:QDV, author = "Malick Ebiele and Malika Bendechache and Rob Brennan", title = "Quantitative Data Valuation Methods: a Systematic Review and Taxonomy", journal = j-JDIQ, volume = "17", number = "2", pages = "10:1--10:39", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736178", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:41 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Background: Data valuation is an area of study encompassing but not limited to data quality, machine learning, applied energy, and information economics. The primary focus of data valuation research is the development of methodologies for determining the \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "10", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wan:2025:ESI, author = "Shaohua Wan and Carmen Bisogni and Marco Zappatore and Manoranjan Paul", title = "Editorial: Special Issue on Advanced Artificial Intelligence Technologies for Multimedia Big Data Quality", journal = j-JDIQ, volume = "17", number = "3", pages = "11:1--11:4", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769264", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "This editorial summarizes the content of the Special Issue on Advanced Artificial Intelligence Technologies for Multimedia Big Data Quality of the Journal of Data and Information Quality (JDIQ).", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "11", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Al-Ameri:2025:SAS, author = "Abdullah Al-Ameri and Waleed Al-Shammari and Aniello Castiglione and Michele Nappi and Chiara Pero and Muhammad Umer", title = "Student Academic Success Prediction Using Learning Management Multimedia Data With Convoluted Features and Ensemble Model", journal = j-JDIQ, volume = "17", number = "3", pages = "12:1--12:16", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3687268", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Predicting students' academic success is crucial for educational institutions to provide targeted support and interventions to those at risk of underperforming. With the increasing adoption of digital learning management systems (LMS), there has been a \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "12", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wu:2025:HPR, author = "Zongda Wu and Guoqi Lin and Huawen Liu and Jian Xie and Guandong Xu and Enhong Chen and Gang Li", title = "How to Protect of Reader Preference Privacy in Mobile Book Information Services: a Technical Method", journal = j-JDIQ, volume = "17", number = "3", pages = "13:1--13:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3688395", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Along with the wide popularization of mobile terminal devices such as smart phones, mobile libraries, which provide mobile book information services as the core business, have become an important part of people's daily life. 
However, with the rapid \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "13", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Tan:2025:EEL, author = "Kehui Tan and Jiayang Yao and Tianqi Pang and Chenyou Fan and Yu Song", title = "{ELF}: Educational {LLM} Framework of Improving and Evaluating {AI}-generated Content for Classroom Teaching", journal = j-JDIQ, volume = "17", number = "3", pages = "14:1--14:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712065", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Recent studies [48, 72] have demonstrated that Large Language Models (LLMs), like ChatGPT [3, 46] and LLAMA [59], can assist with routine teaching tasks and have the potential to revolutionize traditional education. However, other studies [35] highlight \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "14", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Xu:2025:OAB, author = "Honghui Xu and Zhipeng Cai and Liran Ma and Yingshu Li and Daehee Seo and Wei Li", title = "Overheard: Audio-based Integral Event Inference", journal = j-JDIQ, volume = "17", number = "3", pages = "15:1--15:17", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3695771", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "There is no doubt that the popularity of smart devices and the development of deep learning models bring individuals too much convenience.
However, some rancorous attackers can also implement unexpected privacy inferences on sensed data from smart devices \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "15", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Lou:2025:DED, author = "Jungang Lou and Xuhong Wu and Kang Zhao and Qing Shen and Jinnan Yang", title = "{DUTNG}: Employing Dynamically Updating Traffic Network Graph for Short-term Traffic Flow Prediction", journal = j-JDIQ, volume = "17", number = "3", pages = "16:1--16:18", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3712066", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The complex topology of actual road networks and the interlinked nature of traffic flow with spatial--temporal factors pose challenges to traditional node-static correlation models. Therefore, the DUTNG model---a short-term traffic flow prediction model \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf.
Qual.", articleno = "16", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Wang:2025:ANA, author = "Xiaodong Wang and Longyun Qi and Xingshen Wei and Weiping Zhu and Haitao Jiang and Zhitao Guan", title = "{AED}: a Novel Approach for Intrusion Detection without Abnormal Samples in Big Data Environment", journal = j-JDIQ, volume = "17", number = "3", pages = "17:1--17:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3695879", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "The rapid advance of multimedia devices, including sensors, cameras, and mobile phones, has given rise to the prevalence of Internet of Multimedia Things (IoMT), generating huge volumes of application-oriented multimedia data. At the same time, network \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "17", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Zheng:2025:LAA, author = "Yanwei Zheng and Yaling Li and Changrui Li and Taiqi Zhang and Yifei Zou and Dongxiao Yu", title = "Learning Attribute Attention and Retrospect Location for Instance Object Navigation", journal = j-JDIQ, volume = "17", number = "3", pages = "18:1--18:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3706423", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "Visual object navigation, a classical problem in embodied intelligence tasks, requires agents to find a specified target using only the first-person view of visual information. A number of methods that search only for an object in the same category are \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. 
Qual.", articleno = "18", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", } @Article{Liu:2025:RSI, author = "Xinfu Liu and Benze Wu and Yirui Wu", title = "A Remote Sensing Image Classification Method Based on Detail Attention Sampling and Teacher-Student Network", journal = j-JDIQ, volume = "17", number = "3", pages = "19:1--19:19", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721984", ISSN = "1936-1955", ISSN-L = "1936-1955", bibdate = "Wed Oct 8 06:42:42 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/jdiq.bib", abstract = "With the continuous development of remote sensing technology, the data volume of high-resolution is increasing with the large coverage of high-resolution remote sensing images, changeable objects, and complex backgrounds. However, the sensitivity field of \ldots{}", acknowledgement = ack-nhfb, ajournal = "J. Data Inf. Qual.", articleno = "19", fjournal = "Journal of Data and Information Quality (JDIQ)", journal-URL = "https://dl.acm.org/loi/jdiq", }