%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.50",
%%%     date            = "14 October 2017",
%%%     time            = "10:26:28 MDT",
%%%     filename        = "talip.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "52522 8478 41180 402838",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "Asian language information processing,
%%%                        bibliography, BibTeX, TALIP",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Asian language
%%%                        information processing (TALIP) (CODEN none,
%%%                        ISSN 1530-0226 (print), 1558-3430
%%%                        (electronic)), which began publishing in
%%%                        March 2002.
%%%
%%%                        Publication ceased with volume 13, number 4,
%%%                        in 2014 when the journal was renamed to ACM
%%%                        Transactions on Asian and Low-Resource
%%%                        Language Information Processing (TALLIP).
%%%                        The new journal is covered in a separate
%%%                        bibliography, tallip.bib.
%%%
%%%                        The journal has a World Wide Web site at
%%%
%%%                            http://www.acm.org/pubs/talip/
%%%                            http://portal.acm.org/browse_dl.cfm?&idx=J820
%%%
%%%                        At version 1.50, the year coverage looked
%%%                        like this:
%%%
%%%                             2002 (  15)    2007 (  14)    2012 (  18)
%%%                             2003 (  22)    2008 (  13)    2013 (  17)
%%%                             2004 (  17)    2009 (  19)    2014 (  18)
%%%                             2005 (  17)    2010 (  15)
%%%                             2006 (  28)    2011 (  21)
%%%
%%%                             Article:        234
%%%
%%%                             Total entries:  234
%%%
%%%                        This bibliography has been constructed
%%%                        primarily from the publisher Web site.
%%%
%%%                        Numerous errors in the sources noted above
%%%                        have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation labels were automatically generated
%%%                        by software developed for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, with the help of
%%%                        ``bibsort -byvolume''.  The bibsort utility
%%%                        is available from ftp.math.utah.edu in
%%%                        /pub/tex/bib.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
@Preamble{
    "\hyphenation{ }"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department
                    of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt
                    Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148, e-mail:
                    \path|beebe@math.utah.edu|, \path|beebe@acm.org|,
                    \path|beebe@computer.org| (Internet), URL:
                    \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TALIP = "ACM Transactions on Asian Language Information
                   Processing"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Wong:2002:P,
  author =       "Kam-Fai Wong and Jun'ichi Tsujii",
  title =        "Prologue",
  journal =      j-TALIP,
  volume =       "1",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:34 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Gao:2002:TUA,
  author =       "Jianfeng Gao and Joshua Goodman and Mingjing Li and
                 Kai-Fu Lee",
  title =        "Toward a unified approach to statistical language
                 modeling for {Chinese}",
  journal =      j-TALIP,
  volume =       "1",
  number =       "1",
  pages =        "3--33",
  month =        mar,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/509900.509903",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:34 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Lai:2002:MTE,
  author =       "Yu-Sheng Lai and Chung-Hsien Wu",
  title =        "Meaningful term extraction and discriminative term
                 selection in text categorization via unknown-word
                 methodology",
  journal =      j-TALIP,
  volume =       "1",
  number =       "1",
  pages =        "34--64",
  month =        mar,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/509900.509904",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:34 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Kim:2002:MBG,
  author =       "Byeongchang Kim and Gary Geunbae Lee and Jong-Hyeok
                 Lee",
  title =        "Morpheme-based grapheme to phoneme conversion using
                 phonetic patterns and morphophonemic connectivity
                 information",
  journal =      j-TALIP,
  volume =       "1",
  number =       "1",
  pages =        "65--82",
  month =        mar,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:34 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Lee:2002:UTI,
  author =       "Tan Lee and Wai Lau and Y. W. Wong and P. C. Ching",
  title =        "Using tone information in {Cantonese} continuous
                 speech recognition",
  journal =      j-TALIP,
  volume =       "1",
  number =       "1",
  pages =        "83--102",
  month =        mar,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/509900.509906",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:34 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Chen:2002:BCE,
  author =       "Hsin-Hsi Chen and Chi-Ching Lin and Wen-Cheng Lin",
  title =        "Building a {Chinese--English} wordnet for translingual
                 applications",
  journal =      j-TALIP,
  volume =       "1",
  number =       "2",
  pages =        "103--122",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/568954.568955",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:36 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

@Article{Meng:2002:GPM,
  author =       "Helen Meng and Po-Chui Luk and Kui Xu and Fuliang
                 Weng",
  title =        "{GLR} parsing with multiple grammars for natural
                 language queries",
  journal =      j-TALIP,
  volume =       "1",
  number =       "2",
  pages =        "123--144",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/568954.568956",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Nov 5 23:44:36 MST 2002",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Murata:2002:CTM, author = "Masaki Murata and Qing Ma and Hitoshi Isahara", title = "Comparison of three machine-learning methods for {Thai} part-of-speech tagging", journal = j-TALIP, volume = "1", number = "2", pages = "145--158", month = jun, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1145/568954.568957", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Nov 5 23:44:36 MST 2002", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lu:2002:TWQ, author = "Wen-Hsiang Lu and Lee-Feng Chien and Hsi-Jian Lee", title = "Translation of {Web} queries using anchor text mining", journal = j-TALIP, volume = "1", number = "2", pages = "159--172", month = jun, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1145/568954.568958", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Nov 5 23:44:36 MST 2002", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2002:WBA, author = "Wenjie Li and Kam-Fai Wong", title = "A word-based approach for modeling and discovering temporal relations embedded in {Chinese} sentences", journal = j-TALIP, volume = "1", number = "3", pages = "173--206", month = sep, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:00 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lee:2002:ACB, author = "Jin-Seok Lee and Byeongchang Kim and Gary Geunbae Lee", title = "Automatic corpus-based tone and break-index prediction using {K-ToBI} representation", journal = j-TALIP, volume = "1", number = "3", pages = "207--224", month = sep, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:00 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Luk:2002:CCD, author = "Robert W. P. Luk and K. L. Kwok", title = "A comparison of {Chinese} document indexing strategies and retrieval models", journal = j-TALIP, volume = "1", number = "3", pages = "225--268", month = sep, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:00 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Suzuki:2002:LCS, author = "Izumi Suzuki and Yoshiki Mikami and Ario Ohsato and Yoshihide Chubachi", title = "A language and character set determination method based on {N}-gram statistics", journal = j-TALIP, volume = "1", number = "3", pages = "269--278", month = sep, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:00 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Jin:2002:CDC, author = "Honglan Jin and Kam-Fai Wong", title = "A {Chinese} dictionary construction algorithm for information retrieval", journal = j-TALIP, volume = "1", number = "4", pages = "281--296", month = dec, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:01 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2002:CCB, author = "Yuanxiang Li and Xiaoqing Ding and Chew Lim Tan", title = "Combining character-based bigrams with word-based bigrams in contextual postprocessing for {Chinese} script recognition", journal = j-TALIP, volume = "1", number = "4", pages = "297--309", month = dec, year = "2002", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Aug 7 08:49:01 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lo:2003:CLS, author = "Wai-Kit Lo and Helen Meng and P. C. 
Ching", title = "Cross-language spoken document retrieval using {HMM}-based retrieval model with multi-scale fusion", journal = j-TALIP, volume = "2", number = "1", pages = "1--26", month = mar, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sun Jan 11 10:17:38 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Shi:2003:OHC, author = "Daming Shi and Robert I. Damper and Steve R. Gunn", title = "Offline handwritten {Chinese} character recognition by radical decomposition", journal = j-TALIP, volume = "2", number = "1", pages = "27--48", month = mar, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sun Jan 11 10:17:38 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lee:2003:TAS, author = "Yue-Shi Lee", title = "Task adaptation in stochastic language model for {Chinese} homophone disambiguation", journal = j-TALIP, volume = "2", number = "1", pages = "49--62", month = mar, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sun Jan 11 10:17:38 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Shieh:2003:EAT, author = "Jiann-Cherng Shieh", title = "An efficient accessing technique for {Taiwanese} phonetic transcriptions", 
journal = j-TALIP, volume = "2", number = "1", pages = "63--77", month = mar, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sun Jan 11 10:17:38 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Oard:2003:SLE, author = "Douglas W. Oard", title = "The surprise language exercises", journal = j-TALIP, volume = "2", number = "2", pages = "79--84", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Allan:2003:MTD, author = "James Allan and Victor Lavrenko and Margaret E. 
Connell", title = "A month to topic detection and tracking in {Hindi}", journal = j-TALIP, volume = "2", number = "2", pages = "85--100", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Strassel:2003:LRC, author = "Stephanie Strassel and Mike Maxwell and Christopher Cieri", title = "Linguistic resource creation for research and technology development: a recent experiment", journal = j-TALIP, volume = "2", number = "2", pages = "101--117", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Dorr:2003:RPD, author = "Bonnie J. 
Dorr and Necip Fazil Ayan and Nizar Habash and Nitin Madnani and Rebecca Hwa", title = "Rapid porting of {DUSTer} to {Hindi}", journal = j-TALIP, volume = "2", number = "2", pages = "118--123", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Huang:2003:ENE, author = "Fei Huang and Stephan Vogel and Alex Waibel", title = "Extracting named entity translingual equivalence with limited resources", journal = j-TALIP, volume = "2", number = "2", pages = "124--129", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Larkey:2003:HCT, author = "Leah S. Larkey and Margaret E. 
Connell and Nasreen Abduljaleel", title = "{Hindi CLIR} in thirty days", journal = j-TALIP, volume = "2", number = "2", pages = "130--142", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lavie:2003:EHE, author = "Alon Lavie and Stephan Vogel and Lori Levin and Erik Peterson and Katharina Probst and Ariadna Font Llitj{\'o}s and Rachel Reynolds and Jaime Carbonell and Richard Cohen", title = "Experiments with a {Hindi-to-English} transfer-based {MT} system under a miserly data scenario", journal = j-TALIP, volume = "2", number = "2", pages = "143--163", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Xu:2003:CLR, author = "Jinxi Xu and Ralph Weischedel", title = "Cross-lingual retrieval for {Hindi}", journal = j-TALIP, volume = "2", number = "2", pages = "164--168", month = jun, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:35 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{May:2003:SWC, author = "Jonathan May and Ada Brunstein and 
Prem Natarajan and Ralph Weischedel", title = "Surprise! {What}'s in a {Cebuano} or {Hindi Name?}", journal = j-TALIP, volume = "2", number = "3", pages = "169--180", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sekine:2003:HEC, author = "Satoshi Sekine and Ralph Grishman", title = "{Hindi-English} cross-lingual question-answering system", journal = j-TALIP, volume = "2", number = "3", pages = "181--192", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Ma:2003:AHO, author = "Huanfeng Ma and David Doermann", title = "Adaptive {Hindi OCR} using generalized {Hausdorff} image comparison", journal = j-TALIP, volume = "2", number = "3", pages = "193--218", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{He:2003:MMI, author = "Daqing He and Douglas W. 
Oard and Jianqiang Wang and Jun Luo and Dina Demner-Fushman and Kareem Darwish and Philip Resnik and Sanjeev Khudanpur and Michael Nossal and Michael Subotin and Anton Leuski", title = "Making {MIRACLEs}: {Interactive} translingual search for {Cebuano} and {Hindi}", journal = j-TALIP, volume = "2", number = "3", pages = "219--244", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Leuski:2003:CLC, author = "Anton Leuski and Chin-Yew Lin and Liang Zhou and Ulrich Germann and Franz Josef Och and Eduard Hovy", title = "Cross-lingual {C*ST*RD}: {English} access to {Hindi} information", journal = j-TALIP, volume = "2", number = "3", pages = "245--269", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Dorr:2003:CLH, author = "Bonnie Dorr and David Zajic and Richard Schwartz", title = "Cross-language headline generation for {Hindi}", journal = j-TALIP, volume = "2", number = "3", pages = "270--289", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", 
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2003:RDH, author = "Wei Li and Andrew McCallum", title = "Rapid development of {Hindi} named entity recognition using conditional random fields and feature induction", journal = j-TALIP, volume = "2", number = "3", pages = "290--294", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Maynard:2003:RCI, author = "Diana Maynard and Valentin Tablan and Kalina Bontcheva and Hamish Cunningham", title = "Rapid customization of an information extraction system for a surprise language", journal = j-TALIP, volume = "2", number = "3", pages = "295--300", month = sep, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kang:2003:IPP, author = "Mi-Young Kang and Aesun Yoon and Hyuk-Chul Kwon", title = "Improving partial parsing based on error-pattern analysis for a {Korean} grammar-checker", journal = j-TALIP, volume = "2", number = "4", pages = "301--323", month = dec, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information 
Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2003:RRE, author = "Harksoo Kim and Jungyun Seo", title = "Resolution of referring expressions in a {Korean} multimodal dialogue system", journal = j-TALIP, volume = "2", number = "4", pages = "324--337", month = dec, year = "2003", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mani:2004:ISI, author = "Inderjeet Mani and James Pustejovsky and Beth Sundheim", title = "Introduction to the special issue on temporal information processing", journal = j-TALIP, volume = "3", number = "1", pages = "1--10", month = mar, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Han:2004:FRT, author = "Benjamin Han and Alon Lavie", title = "A framework for resolution of time in natural language", journal = j-TALIP, volume = "3", number = "1", pages = "11--32", month = mar, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Schilder:2004:EMT, author = "Frank 
Schilder", title = "Extracting meaning from temporal nouns and temporal prepositions", journal = j-TALIP, volume = "3", number = "1", pages = "33--50", month = mar, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Jang:2004:ATT, author = "Seok Bae Jang and Jennifer Baldwin and Inderjeet Mani", title = "Automatic {TIMEX2} tagging of {Korean} news", journal = j-TALIP, volume = "3", number = "1", pages = "51--65", month = mar, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Hobbs:2004:OTS, author = "Jerry R. 
Hobbs and Feng Pan", title = "An ontology of time for the {Semantic Web}", journal = j-TALIP, volume = "3", number = "1", pages = "66--85", month = mar, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Nov 4 08:37:36 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Gao:2004:ISI, author = "Jianfeng Gao and Chin-Yew Lin", title = "Introduction to the special issue on statistical language modeling", journal = j-TALIP, volume = "3", number = "2", pages = "87--93", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2004:LTL, author = "Woosung Kim and Sanjeev Khudanpur", title = "Lexical triggers and latent semantic analysis for cross-lingual language model adaptation", journal = j-TALIP, volume = "3", number = "2", pages = "94--112", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Linares:2004:HLM, author = "Diego Linares and Jos{\'e}-Miguel Bened{\'\i} and Joan-Andreu S{\'a}nchez", title = "A hybrid language model based on a combination of {$N$}-grams 
and stochastic context-free grammars", journal = j-TALIP, volume = "3", number = "2", pages = "113--127", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chen:2004:DHG, author = "Berlin Chen and Hsin-Min Wang and Lin-Shan Lee", title = "A discriminative {HMM\slash N}-gram-based retrieval approach for {Mandarin} spoken documents", journal = j-TALIP, volume = "3", number = "2", pages = "128--145", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Nguyen:2004:EBS, author = "Minh Le Nguyen and Susumu Horiguchi and Akira Shimazu and Bao Tu Ho", title = "Example-based sentence reduction using the hidden {Markov} model", journal = j-TALIP, volume = "3", number = "2", pages = "146--158", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Fung:2004:MEC, author = "Pascale Fung and Grace Ngai and Yongsheng Yang and Benfeng Chen", title = "A maximum-entropy {Chinese} parser augmented by 
transformation-based learning", journal = j-TALIP, volume = "3", number = "2", pages = "159--168", month = jun, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Nov 22 06:20:04 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2004:AMF, author = "Yujia Li and Tan Lee and Yao Qian", title = "Analysis and modeling of {F0} contours for {Cantonese} text-to-speech", journal = j-TALIP, volume = "3", number = "3", pages = "169--180", month = sep, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Huang:2004:UWB, author = "Chien-Chung Huang and Shui-Lung Chuang and Lee-Feng Chien", title = "Using a {Web}-based categorization approach to generate thematic metadata from texts", journal = j-TALIP, volume = "3", number = "3", pages = "190--212", month = sep, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Myaeng:2004:ISI, author = "Sung Hyon Myaeng", title = "Introduction to the special issue on computer processing of oriental languages", journal = j-TALIP, volume = "3", number = "4", 
pages = "213--213", month = dec, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Baoli:2004:ANN, author = "Li Baoli and Lu Qin and Yu Shiwen", title = "An adaptive {$k$}-nearest neighbor text categorization strategy", journal = j-TALIP, volume = "3", number = "4", pages = "215--226", month = dec, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2004:UTI, author = "Pyung Kim and Sung Hyon Myaeng", title = "Usefulness of temporal information automatically extracted from news articles for topic tracking", journal = j-TALIP, volume = "3", number = "4", pages = "227--242", month = dec, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhang:2004:ESS, author = "Le Zhang and Jingbo Zhu and Tianshun Yao", title = "An evaluation of statistical spam filtering techniques", journal = j-TALIP, volume = "3", number = "4", pages = "243--269", month = dec, year = "2004", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 
(electronic)", ISSN-L = "1530-0226", bibdate = "Thu Apr 14 12:20:22 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wu:2005:DSF, author = "Chung-Hsien Wu and Jui-Feng Yeh and Ming-Jun Chen", title = "Domain-specific {FAQ} retrieval using independent aspects", journal = j-TALIP, volume = "4", number = "1", pages = "1--17", month = mar, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jul 7 13:48:21 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Murata:2005:CEV, author = "Masaki Murata and Masao Utiyama and Kiyotaka Uchimoto and Hitoshi Isahara and Qing Ma", title = "Correction of errors in a verb modality corpus for machine translation with a machine-learning method", journal = j-TALIP, volume = "4", number = "1", pages = "18--37", month = mar, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jul 7 13:48:21 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Hendessi:2005:SSP, author = "F. Hendessi and A. Ghayoori and T. A. 
Gulliver", title = "A speech synthesizer for {Persian} text using a neural network with a smooth ergodic {HMM}", journal = j-TALIP, volume = "4", number = "1", pages = "38--52", month = mar, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jul 7 13:48:21 MDT 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhang:2005:COT, author = "Ying Zhang and Phil Vines and Justin Zobel", title = "{Chinese} {OOV} translation and post-translation query expansion in {Chinese--English} cross-lingual information retrieval", journal = j-TALIP, volume = "4", number = "2", pages = "57--77", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Qu:2005:TES, author = "Yan Qu and David A. Hull and Gregory Grefenstette and David A. 
Evans and Motoko Ishikawa and Setsuko Nara and Toshiya Ueda and Daisuke Noda and Kousaku Arita and Yuki Funakoshi and Hiroshi Matsuda", title = "Towards effective strategies for monolingual and bilingual information retrieval: {Lessons} learned from {NTCIR-4}", journal = j-TALIP, volume = "4", number = "2", pages = "78--110", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sakai:2005:FPR, author = "Tetsuya Sakai and Toshihiko Manabe and Makoto Koyama", title = "Flexible pseudo-relevance feedback via selective sampling", journal = j-TALIP, volume = "4", number = "2", pages = "111--135", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kwok:2005:RRP, author = "Kui Lam Kwok and Sora Choi and Norbert Dinstl", title = "Rich results from poor resources: {NTCIR-4} monolingual and cross-lingual retrieval of {Korean} texts using {Chinese} and {English}", journal = j-TALIP, volume = "4", number = "2", pages = "136--158", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language 
Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Savoy:2005:CSM, author = "Jacques Savoy", title = "Comparative study of monolingual and multilingual search models for use with {Asian} languages", journal = j-TALIP, volume = "4", number = "2", pages = "159--185", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mase:2005:PTS, author = "Hisao Mase and Tadataka Matsubayashi and Yuichi Ogawa and Makoto Iwayama and Tadaaki Oshio", title = "Proposal of two-stage patent retrieval method considering the claim structure", journal = j-TALIP, volume = "4", number = "2", pages = "186--202", month = jun, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Dec 17 08:07:33 MST 2005", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Nakagawa:2005:PSI, author = "Hiroshi Nakagawa and Tatsunori Mori and Noriko Kando", title = "Preface to the special issues on {NTCIR-4}", journal = j-TALIP, volume = "4", number = "3", pages = "237--242", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kato:2005:ODQ, author = "Tsuneaki Kato and Jun'ichi Fukumoto and Fumito Masui and Noriko Kando", title = "Are open-domain question answering technologies useful for information access dialogues?---an empirical study and a proposal of a novel challenge", journal = j-TALIP, volume = "4", number = "3", pages = "243--262", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Isozaki:2005:AHP, author = "Hideki Isozaki", title = "An analysis of a high-performance {Japanese} question answering system", journal = j-TALIP, volume = "4", number = "3", pages = "263--279", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mori:2005:JQA, author = "Tatsunori Mori", title = "{Japanese} question-answering system using {A*} search and its improvement", journal = j-TALIP, volume = "4", number = "3", pages = "280--304", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mori:2005:MAF, author = "Tatsunori Mori and Masanori Nozawa and Yoshiaki Asada", title = "Multi-answer-focused multi-document summarization using a question-answering engine", journal = j-TALIP, volume = "4", number = "3", pages = "305--320", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Okazaki:2005:ICO, author = "Naoaki Okazaki and Yutaka Matsuo and Mitsuru Ishizuka", title = "Improving chronological ordering of sentences extracted from multiple newspaper articles", journal = j-TALIP, volume = "4", number = "3", pages = "321--339", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Yoshioka:2005:CPB, author = "Masaharu Yoshioka and Makoto Haraguchi", title = "On a combination of probabilistic and {Boolean} {IR} models for {WWW} document retrieval", journal = j-TALIP, volume = "4", number = "3", pages = "340--356", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lingpeng:2005:CIR, author = "Yang Lingpeng and Ji Donghong and Tang Li and Niu Zhengyu", title = "{Chinese} information retrieval based on terms and relevant terms", journal = j-TALIP, volume = "4", number = "3", pages = "357--374", month = sep, year = "2005", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jan 26 08:28:41 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sakai:2006:ISI, author = "Tetsuya Sakai and Yuji Matsumoto", title = "Introduction to the special issue: {Recent} advances in information processing and access for {Japanese}", journal = j-TALIP, volume = "4", number = "4", pages = "375--376", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Doi:2006:EBM, author = "Takao Doi and Hirofumi Yamamoto and Eiichiro Sumita", title = "Example-based machine translation using efficient sentence retrieval based on edit-distance", journal = j-TALIP, volume = "4", number = "4", pages = "377--399", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Tomiura:2006:ESS, author = "Yoichi Tomiura and Shosaku Tanaka and Toru Hitaka", title = "Estimating satisfactoriness of selectional restriction from corpus without a thesaurus", journal = j-TALIP, volume = "4", number = "4", pages = "400--416", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Iida:2006:ARA, author = "Ryu Iida and Kentaro Inui and Yuji Matsumoto", title = "Anaphora resolution by antecedent identification followed by anaphoricity determination", journal = j-TALIP, volume = "4", number = "4", pages = "417--434", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Inui:2006:ACK, author = "Takashi Inui and Kentaro Inui and Yuji Matsumoto", title = "Acquiring causal knowledge from text using the connective marker {\em tame\/}", journal = j-TALIP, volume = "4", number = "4", pages = "435--474", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Ma:2006:TSB, author = "Qiang Ma and Katsumi Tanaka", title = "Topic-structure-based complementary information retrieval and its application", journal = j-TALIP, volume = "4", number = "4", pages = "475--503", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Park:2006:ATM, author = "Jong C. Park and Gary Geunbae Lee and Limsoon Wong", title = "{AUTHOR}: {Text} mining and management in biomedicine", journal = j-TALIP, volume = "5", number = "1", pages = "1--3", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Park:2006:MBB, author = "Kyung-Mi Park and Seon-Ho Kim and Hae-Chang Rim and Young-Sook Hwang", title = "{ME}-based biomedical named entity recognition using lexical knowledge", journal = j-TALIP, volume = "5", number = "1", pages = "4--21", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Nenadic:2006:MSR, 
author = "Goran Nenadi{\'c} and Sophia Ananiadou", title = "Mining semantically related terms from biomedical literature", journal = j-TALIP, volume = "5", number = "1", pages = "22--43", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2006:ECI, author = "Jung-Jae Kim and Jong C. Park", title = "Extracting contrastive information from negation patterns in biomedical literature", journal = j-TALIP, volume = "5", number = "1", pages = "44--60", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2006:TPL, author = "Eunju Kim and Yu Song and Cheongjae Lee and Kyoungduk Kim and Gary Geunbae Lee and Byoung-Kee Yi and Jeongwon Cha", title = "Two-phase learning for biological event extraction and verification", journal = j-TALIP, volume = "5", number = "1", pages = "61--73", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mima:2006:TBK, author = "Hideki Mima and Sophia 
Ananiadou and Katsumori Matsushima", title = "Terminology-based knowledge mining for new knowledge discovery", journal = j-TALIP, volume = "5", number = "1", pages = "74--88", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Carpuat:2006:AWS, author = "Marine Carpuat and Pascale Fung and Grace Ngai", title = "Aligning word senses using bilingual corpora", journal = j-TALIP, volume = "5", number = "2", pages = "89--120", month = jun, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1165255.1165256", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The growing importance of multilingual information retrieval and machine translation has made multilingual ontologies extremely valuable resources. Since the construction of an ontology from scratch is a very expensive and time-consuming undertaking, it is attractive to consider ways of automatically aligning monolingual ontologies, which already exist for many of the world's major languages. Previous research exploited similarity in the structure of the ontologies to align, or manually created bilingual resources. These approaches cannot be used to align ontologies with vastly different structures and can only be applied to much studied language pairs for which expensive resources are already available. 
In this paper, we propose a novel approach to align the ontologies at the node level: Given a concept represented by a particular word sense in one ontology, our task is to find the best corresponding word sense in the second language ontology. To this end, we present a language-independent, corpus-based method that borrows from techniques used in information retrieval and machine translation. We show its efficiency by applying it to two very different ontologies in very different languages: the Mandarin Chinese HowNet and the American English WordNet. Moreover, we propose a methodology to measure bilingual corpora comparability and show that our method is robust enough to use noisy nonparallel bilingual corpora efficiently, when clean parallel corpora are not available.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lee:2006:ABN, author = "Chun-Jen Lee and Jason S. Chang and Jyh-Shing R. Jang", title = "Alignment of bilingual named entities in parallel corpora using statistical models and multiple knowledge sources", journal = j-TALIP, volume = "5", number = "2", pages = "121--145", month = jun, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1165255.1165257", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Named entity (NE) extraction is one of the fundamental tasks in natural language processing (NLP). Although many studies have focused on identifying NEs within monolingual documents, aligning NEs in bilingual documents has not been investigated extensively due to the complexity of the task. In this article we introduce a new approach to aligning bilingual NEs in parallel corpora by incorporating statistical models with multiple knowledge sources. 
In our approach, we model the process of translating an English NE phrase into a Chinese equivalent using lexical translation\slash transliteration probabilities for word translation and alignment probabilities for word reordering. The method involves automatically learning phrase alignment and acquiring word translations from a bilingual phrase dictionary and parallel corpora, and automatically discovering transliteration transformations from a training set of name-transliteration pairs. The method also involves language-specific knowledge functions, including handling abbreviations, recognizing Chinese personal names, and expanding acronyms. At runtime, the proposed models are applied to each source NE in a pair of bilingual sentences to generate and evaluate the target NE candidates; the source and target NEs are then aligned based on the computed probabilities. Experimental results demonstrate that the proposed approach, which integrates statistical models with extra knowledge sources, is highly feasible and offers significant improvement in performance compared to our previous work, as well as the traditional approach of IBM Model 4.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Shirado:2006:UJH, author = "Tamotsu Shirado and Satoko Marumoto and Masaki Murata and Hitoshi Isahara", title = "Using {Japanese} honorific expressions: a psychological study", journal = j-TALIP, volume = "5", number = "2", pages = "146--164", month = jun, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1165255.1165258", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We investigated, via experiment, knowledge of normative honorific expressions as used in textbooks and in practice 
by people. Forty subjects divided into four groups according to age (younger\slash older) and gender (male\slash female) participated in the experiments. The results show that knowledge about the use of normative honorific expressions in textbooks is similar to that demonstrated by the younger subject groups, but differed from that of the older subject groups. The knowledge of the older subjects was more complex than that shown in textbooks or demonstrated by the younger subjects. A model that can identify misuse of honorific expressions in sentences is the framework for this investigation. The model is minimal, but could represent 76\% to 92\% of the subjects' knowledge regarding each honorific element. This model will be useful in the development of computer-aided systems to help teach how honorific expressions should be used.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wu:2006:ERT, author = "Chung-Hsien Wu and Ze-Jing Chuang and Yu-Chung Lin", title = "Emotion recognition from text using semantic labels and separable mixture models", journal = j-TALIP, volume = "5", number = "2", pages = "165--183", month = jun, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1165255.1165259", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This study presents a novel approach to automatic emotion recognition from text. First, emotion generation rules (EGRs) are manually deduced from psychology to represent the conditions for generating emotion. Based on the EGRs, the emotional state of each sentence can be represented as a sequence of semantic labels (SLs) and attributes (ATTs); SLs are defined as the domain-independent features, while ATTs are domain-dependent. 
The emotion association rules (EARs) represented by SLs and ATTs for each emotion are automatically derived from the sentences in an emotional text corpus using the a priori algorithm. Finally, a separable mixture model (SMM) is adopted to estimate the similarity between an input sentence and the EARs of each emotional state. Since some features defined in this approach are domain-dependent, a dialog system focusing on the students' daily expressions is constructed, and only three emotional states, happy, unhappy, and neutral, are considered for performance evaluation. According to the results of the experiments, given the domain corpus, the proposed approach is promising, and easily ported into other domains.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Dale:2006:ISS, author = "Robert Dale", title = "Introduction to the {Special} section: {Extended} best papers from {IJCNLP 2005}", journal = j-TALIP, volume = "5", number = "3", pages = "183--184", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Oh:2006:MTM, author = "Jong-Hoon Oh and Key-Sun Choi and Hitoshi Isahara", title = "A machine transliteration model based on correspondence between graphemes and phonemes", journal = j-TALIP, volume = "5", number = "3", pages = "185--208", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", 
acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Gao:2006:ESL, author = "Jianfeng Gao and Hisami Suzuki and Wei Yuan", title = "An empirical study on language model adaptation", journal = j-TALIP, volume = "5", number = "3", pages = "209--227", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Ye:2006:SRL, author = "Patrick Ye and Timothy Baldwin", title = "Semantic role labeling of prepositional phrases", journal = j-TALIP, volume = "5", number = "3", pages = "228--244", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chung:2006:APD, author = "Tze Leung Chung and Robert Wing Pong Luk and Kam Fai Wong and Kui Lam Kwok and Dik Lun Lee", title = "Adapting pivoted document-length normalization for query size: {Experiments} in {Chinese} and {English}", journal = j-TALIP, volume = "5", number = "3", pages = "245--263", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on 
Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Matsumura:2006:ERB, author = "Atsushi Matsumura and Atsuhiro Takasu and Jun Adachi", title = "Effect of relationships between words on {Japanese} information retrieval", journal = j-TALIP, volume = "5", number = "3", pages = "264--289", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Song:2006:ISI, author = "Dawei Song and Jian-Yun Nie", title = "Introduction to special issue on reasoning in natural language information processing", journal = j-TALIP, volume = "5", number = "4", pages = "291--295", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Nie:2006:ILM, author = "Jian-Yun Nie and Guihong Cao and Jing Bai", title = "Inferential language models for information retrieval", journal = j-TALIP, volume = "5", number = "4", pages = "296--322", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Gao:2006:SQT, author = "Jianfeng Gao and Jian-Yun Nie and Ming Zhou", title = "Statistical query translation models for cross-language information retrieval", journal = j-TALIP, volume = "5", number = "4", pages = "323--359", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Liu:2006:SFQ, author = "Yi Liu and Rong Jin and Joyce Y. Chai", title = "A statistical framework for query translation disambiguation", journal = j-TALIP, volume = "5", number = "4", pages = "360--387", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2006:TTT, author = "Baoli Li and Wenjie Li and Qin Lu", title = "Topic tracking with time granularity reasoning", journal = j-TALIP, volume = "5", number = "4", pages = "388--412", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Phan:2006:IDS, author = "Xuan-Hieu Phan and Le-Minh Nguyen and 
Yasushi Inoguchi and Tu-Bao Ho and Susumu Horiguchi", title = "Improving discriminative sequential learning by discovering important association of statistics", journal = j-TALIP, volume = "5", number = "4", pages = "413--438", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chen:2007:UDM, author = "Yong Chen and Kwok-Ping Chan", title = "Using data mining techniques and rough set theory for language modeling", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Hsu:2007:MSB, author = "Chung-Chian Hsu and Chien-Hsing Chen and Tien-Teng Shih and Chun-Kai Chen", title = "Measuring similarity between transliterations against noise data", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sakai:2007:RFQ, author = 
"Tetsuya Sakai", title = "On the reliability of factoid question answering evaluation", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wiseman:2007:CBC, author = "Yair Wiseman and Irit Gefner", title = "Conjugation-based compression for {Hebrew} texts", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wu:2007:TBS, author = "Chung-Hsien Wu and Hung-Yu Su and Yu-Hsien Chiu and Chia-Hung Lin", title = "Transfer-based statistical translation of {Taiwanese} sign language using {PCFG}", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kuo:2007:PSM, author = "Jin-Shea Kuo and Haizhou Li and Ying-Kuei Yang", title = "A phonetic similarity 
model for automatic extraction of transliteration pairs", journal = j-TALIP, volume = "6", number = "2", pages = "6:1--6:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1282080.1282081", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article proposes an approach for the automatic extraction of transliteration pairs from Chinese Web corpora. In this approach, we formulate the machine transliteration process using a syllable-based phonetic similarity model which consists of phonetic confusion matrices and a Chinese character $n$-gram language model. With the phonetic similarity model, the extraction of transliteration pairs becomes a two-step process of recognition followed by validation: First, in the recognition process, we identify the most probable transliteration in the $k$-neighborhood of a recognized English word. Then, in the validation process, we qualify the transliteration pair candidates with a hypothesis test. We carry out an analytical study on the statistics of several key factors in English--Chinese transliteration to help formulate phonetic similarity modeling. We then conduct both supervised and unsupervised learning of a phonetic similarity model on a development database. The experimental results validate the effectiveness of the phonetic similarity model by achieving an $F$-measure of 0.739 in supervised learning. 
The unsupervised learning approach works almost as well as the supervised one, thus allowing us to deploy automatic extraction of transliteration pairs in the Web space.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "extraction of transliteration pairs; machine translation; machine transliteration; phonetic confusion probability; phonetic similarity modeling", } @Article{Xiao:2007:SNM, author = "Jinghui Xiao and Xiaolong Wang and Bingquan Liu", title = "The study of a nonstationary maximum entropy {Markov} model and its application on the pos-tagging task", journal = j-TALIP, volume = "6", number = "2", pages = "7:1--7:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1282080.1282082", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Sequence labeling is a core task in natural language processing. The maximum entropy Markov model (MEMM) is a powerful tool in performing this task. This article enhances the traditional MEMM by exploiting the positional information of language elements. The stationary hypothesis is relaxed in MEMM, and the nonstationary MEMM (NS-MEMM) is proposed. Several related issues are discussed in detail, including the representation of positional information, NS-MEMM implementation, smoothing techniques, and the space complexity issue. Furthermore, the asymmetric NS-MEMM presents a more flexible way to exploit positional information. In the experiments, NS-MEMM is evaluated on both the Chinese and the English pos-tagging tasks. According to the experimental results, NS-MEMM yields effective improvements over MEMM by exploiting positional information. 
The smoothing techniques in this article effectively solve the NS-MEMM data-sparseness problem; the asymmetric NS-MEMM is also an improvement by exploiting positional information in a more flexible way.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "data sparseness problem; Markov property; MEMM; pos-tagging; stationary hypothesis", } @Article{Zhuang:2007:IHD, author = "Yi Zhuang and Yueting Zhuang and Qing Li and Lei Chen", title = "Interactive high-dimensional index for large {Chinese} calligraphic character databases", journal = j-TALIP, volume = "6", number = "2", pages = "8:1--8:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1282080.1282083", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The large numbers of Chinese calligraphic scripts in existence are valuable part of the Chinese cultural heritage. However, due to the shape complexity of these characters, it is hard to employ existing techniques to effectively retrieve and efficiently index them. In this article, using a novel shape-similarity-based retrieval method in which shapes of calligraphic characters are represented by their contour points extracted from the character images, we propose an interactive partial-distance-map (PDM)-based high-dimensional indexing scheme which is designed specifically to speed up the retrieval performance of the large Chinese calligraphic character databases effectively. Specifically, we use the approximate minimal bounding sphere of a query character and utilize users' relevance feedback to refine the query gradually. Comprehensive experiments are conducted to testify the efficiency and effectiveness of this method. 
In addition, a new $k$-NN search called Pseudo $k$-NN (P $k$-NN) search is presented to better facilitate the PDM-based character retrieval.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Chinese calligraphic character; hyper-centre relocation; Pseudo k-NN", } @Article{Saraswathi:2007:CPE, author = "S. Saraswathi and T. V. Geetha", title = "Comparison of performance of enhanced morpheme-based language model with different word-based language models for improving the performance of {Tamil} speech recognition system", journal = j-TALIP, volume = "6", number = "3", pages = "9:1--9:??", month = nov, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1290002.1290003", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This paper describes a new technique of language modeling for a highly inflectional Dravidian language, Tamil. It aims to alleviate the main problems encountered in processing of Tamil language, like enormous vocabulary growth caused by the large number of different forms derived from one word. The size of the vocabulary was reduced by decomposing the words into stems and endings and storing these sub word units (morphemes) in the vocabulary separately. An enhanced morpheme-based language model was designed for the inflectional language Tamil. The enhanced morpheme-based language model was trained on the decomposed corpus. The perplexity and Word Error Rate (WER) were obtained to check the efficiency of the model for Tamil speech recognition system. The results were compared with word-based bigram and trigram language models, distance based language model, dependency based language model and class based language model. 
From the results it was analyzed that the enhanced morpheme-based trigram model with Katz back-off smoothing effect improved the performance of the Tamil speech recognition system when compared to the word-based language models.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "language model; morphemes; perplexity; word error rate and speech recognition", } @Article{Hussain:2007:DLS, author = "Sarmad Hussain and Sana Gul and Afifah Waseem", title = "Developing lexicographic sorting: {An} example for {Urdu}", journal = j-TALIP, volume = "6", number = "3", pages = "10:1--10:??", month = nov, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1290002.1290004", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Collation or lexicographic sorting is essential to develop multilingual computing. This paper presents the challenges faced in developing collation sequence for a language. The paper discusses both theoretical linguistic and practical standardization and encoding related considerations that need to be addressed for languages for which relevant standards and/or solutions have not been defined. The paper also defines the process, by giving the details of the procedure followed for Urdu language, which is the national language of Pakistan and is spoken by more than 100 million people across the world. 
The paper is oriented towards organizations involved in developing and using collation standards and the localization industry, and not focused on theoretical issues.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "text processing; Urdu", } @Article{Fukumoto:2007:TTB, author = "Fumiyo Fukumoto and Yoshimi Suzuki", title = "Topic tracking based on bilingual comparable corpora and semisupervised clustering", journal = j-TALIP, volume = "6", number = "3", pages = "11:1--11:??", month = nov, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1290002.1290005", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this paper, we address the problem of skewed data in topic tracking: the small number of stories labeled positive as compared to negative stories and propose a method for estimating effective training stories for the topic-tracking task. For a small number of labeled positive stories, we use bilingual comparable, i.e., English, and Japanese corpora, together with the EDR bilingual dictionary, and extract story pairs consisting of positive and associated stories. To overcome the problem of a large number of labeled negative stories, we classified them into clusters. This is done using a semisupervised clustering algorithm, combining $k$ means with EM. 
The method was tested on the TDT English corpus and the results showed that the system works well when the topic under tracking is talking about an event originating in the source language country, even for a small number of initial positive training stories.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "bilingual comparable corpora; clustering; EM algorithm; N-gram model; topic detection and tracking", } @Article{Iida:2007:ZAR, author = "Ryu Iida and Kentaro Inui and Yuji Matsumoto", title = "Zero-anaphora resolution by learning rich syntactic pattern features", journal = j-TALIP, volume = "6", number = "4", pages = "1:1--1:22", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1316457.1316458", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We approach the zero-anaphora resolution problem by decomposing it into intrasentential and intersentential zero-anaphora resolution tasks. For the former task, syntactic patterns of zeropronouns and their antecedents are useful clues. Taking Japanese as a target language, we empirically demonstrate that incorporating rich syntactic pattern features in a state-of-the-art learning-based anaphora resolution model dramatically improves the accuracy of intrasentential zero-anaphora, which consequently improves the overall performance of zero-anaphora resolution.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Adriani:2007:SIC, author = "Mirna Adriani and Jelita Asian and Bobby Nazief and S. M. M. Tahaghoghi and Hugh E. 
Williams", title = "Stemming {Indonesian}: a confix-stripping approach", journal = j-TALIP, volume = "6", number = "4", pages = "2:1--2:33", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1316457.1316459", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Stemming words to (usually) remove suffixes has applications in text search, machine translation, document summarization, and text classification. For example, English stemming reduces the words 'computer,' 'computing,' 'computation,' and 'computability' to their common morphological root, 'comput-.' In text search, this permits a search for 'computers' to find documents containing all words with the stem 'comput-.' In the Indonesian language, stemming is of crucial importance: words have prefixes, suffixes, infixes, and confixes that make matching related words difficult.\par This work surveys existing techniques for stemming Indonesian words to their morphological roots, presents our novel and highly accurate CS algorithm, and explores the effectiveness of stemming in the context of general-purpose text information retrieval through ad hoc queries.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Indonesian; information retrieval; stemming", } @Article{Thao:2007:NER, author = "Pham Thi Xuan Thao and Tran Quoc Tri and Dinh Dien and Nigel Collier", title = "Named entity recognition in {Vietnamese} using classifier voting", journal = j-TALIP, volume = "6", number = "4", pages = "3:1--3:18", month = dec, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1316457.1316460", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", 
bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Named entity recognition (NER) is one of the fundamental tasks in natural-language processing (NLP). Though the combination of different classifiers has been widely applied in several well-studied languages, this is the first time this method has been applied to Vietnamese. In this article, we describe how voting techniques can improve the performance of Vietnamese NER. By combining several state-of-the-art machine-learning algorithms using voting strategies, our final result outperforms individual algorithms and gained an $F$-measure of 89.12. A detailed discussion about the challenges of NER in Vietnamese is also presented.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "C4.5; Conditional Random Fields; Na{\"\i}ve Bayes named entity recognition; support vector machines; transformation based learning; Vietnamese; voting", } @Article{Chen:2008:SBM, author = "Yufeng Chen and Chengqing Zong", title = "A Structure-Based Model for {Chinese} Organization Name Translation", journal = j-TALIP, volume = "7", number = "1", pages = "1:1--1:??", month = feb, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1330291.1330292", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:10 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Named entity (NE) translation is a fundamental task in multilingual natural language processing. The performance of a machine translation system depends heavily on precise translation of the inclusive NEs. Furthermore, organization name (ON) is the most complex NE for translation among all the NEs. 
In this article, the structure formulation of ONs is investigated and a hierarchical structure-based ON translation model for Chinese-to-English translation system is presented.\par First, the model performs ON chunking; then both the translation of words within chunks and the process of chunk-reordering are achieved by synchronous context-free grammar (CFG). The CFG rules are extracted from bilingual ON pairs in a training program.\par The main contributions of this article are: (1) defining appropriate chunk-units for analyzing the internal structure of Chinese ONs; (2) making the chunk-based ON translation feasible and flexible via a hierarchical CFG derivation; and (3) proposing a training architecture to automatically learn the synchronous CFG for constructing ONs with chunk-units from aligned bilingual ON pairs. The experiments show that the proposed approach translates the Chinese ONs into English with an accuracy of 93.75\% and significantly improves the performance of a baseline statistical machine translation (SMT) system.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "alignment; chunk; hierarchical derivation; machine translation; named entity; organization name; rules extraction; structural analysis; synchronous context-free grammar", } @Article{Jeong:2008:ISR, author = "Minwoo Jeong and Gary Geunbae Lee", title = "Improving Speech Recognition and Understanding using Error-Corrective Reranking", journal = j-TALIP, volume = "7", number = "1", pages = "2:1--2:??", month = feb, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1330291.1330293", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:10 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The main issues of practical spoken-language 
applications for human-computer interface are how to overcome speech recognition errors and guarantee the reasonable end-performance of spoken-language applications. Therefore, handling the erroneously recognized outputs is a key in developing robust spoken-language systems. To address this problem, we present a method to improve the accuracy of speech recognition and performance of spoken-language applications. The proposed error corrective reranking approach exploits recognition environment characteristics and domain-specific semantic information to provide robustness and adaptability for a spoken-language system. We demonstrate some experiments of spoken dialogue tasks and empirical results that show an improvement in accuracy for both speech recognition and spoken-language understanding. In our experiment, we show an error reduction of up to 9.7\% and 16.8\% of word error rate, and 5.5\% and 7.9\% of understanding error for the air travel and telebanking service domains.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "automatic speech recognition; error-corrective reranking; improving spoken dialogue system; spoken-language understanding", } @Article{Kuo:2008:MSG, author = "June-Jei Kuo and Hsin-Hsi Chen", title = "Multidocument Summary Generation: Using Informative and Event Words", journal = j-TALIP, volume = "7", number = "1", pages = "3:1--3:??", month = feb, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1330291.1330294", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:10 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Summary generation for multiple documents poses a number of issues including sentence selection, sentence ordering, and sentence reduction over single-document 
summarization. In addition, the temporal resolution among extracted sentences is also important. This article considers informative words and event words to deal with multidocument summarization. These words indicate the important concepts and relationships in a document or among a set of documents, and can be used to select salient sentences. We present a temporal resolution algorithm, using focusing time and coreference chains, to convert Chinese temporal expressions in a document into calendrical forms. Moreover, we consider the last calendrical form of a sentence as a sentence time stamp to address sentence ordering. Informative words, event words, and temporal words are introduced to a sentence reduction algorithm, which deals with both length constraints and information coverage. Experiments on Chinese-news data sets show significant improvements of both information coverage and readability.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "latent semantic analysis; multidocument summary generation; sentence ordering; sentence reduction; sentence selection; temporal processing", } @Article{Kando:2008:INS, author = "Noriko Kando and Teruko Mitamura and Tetsuya Sakai", title = "Introduction to the {NTCIR-6 Special Issue}", journal = j-TALIP, volume = "7", number = "2", pages = "4:1--4:??", month = jun, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1362782.1362783", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:23 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhou:2008:HTE, author = "Dong Zhou and Mark Truran and 
Tim Brailsford and Helen Ashman", title = "A Hybrid Technique for {English--Chinese} Cross Language Information Retrieval", journal = j-TALIP, volume = "7", number = "2", pages = "5:1--5:??", month = jun, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1362782.1362784", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:23 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this article we describe a hybrid technique for dictionary-based query translation suitable for English--Chinese cross language information retrieval. This technique marries a graph-based model for the resolution of candidate term ambiguity with a pattern-based method for the translation of out-of-vocabulary (OOV) terms. We evaluate the performance of this hybrid technique in an experiment using several NTCIR test collections. Experimental results indicate a substantial increase in retrieval effectiveness over various baseline systems incorporating machine- and dictionary-based translation.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "cross language information retrieval; disambiguation; graph-based analysis; patterns; unknown term translation", } @Article{Higashinaka:2008:AAC, author = "Ryuichiro Higashinaka and Hideki Isozaki", title = "Automatically Acquiring Causal Expression Patterns from Relation-annotated Corpora to Improve Question Answering for why-Questions", journal = j-TALIP, volume = "7", number = "2", pages = "6:1--6:??", month = jun, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1362782.1362785", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:23 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", 
abstract = "This article describes our approach for answering why-questions that we initially introduced at NTCIR-6 QAC-4. The approach automatically acquires causal expression patterns from relation-annotated corpora by abstracting text spans annotated with a causal relation and by mining syntactic patterns that are useful for distinguishing sentences annotated with a causal relation from those annotated with other relations. We use these automatically acquired causal expression patterns to create features to represent answer candidates, and use these features together with other possible features related to causality to train an answer candidate ranker that maximizes the QA performance with regards to the corpus of why-questions and answers. NAZEQA, a Japanese why-QA system based on our approach, clearly outperforms baselines with a Mean Reciprocal Rank (top-5) of 0.223 when sentences are used as answers and with a MRR (top-5) of 0.326 when paragraphs are used as answers, making it presumably the best-performing fully implemented why-QA system. 
Experimental results also verified the usefulness of the automatically acquired causal expression patterns.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "causal expression; pattern mining; question answering; relation-annotated corpus", } @Article{Li:2008:ASV, author = "Yaoyong Li and Kalina Bontcheva", title = "Adapting Support Vector Machines for ${F}$-term-based Classification of Patents", journal = j-TALIP, volume = "7", number = "2", pages = "7:1--7:??", month = jun, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1362782.1362786", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 16 17:12:23 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Support Vector Machines (SVM) have obtained state-of-the-art results on many applications including document classification. However, previous works on applying SVMs to the $F$-term patent classification task did not obtain as good results as other learning algorithms such as k-NN. This is due to the fact that $F$-term patent classification is different from conventional document classification in several aspects, mainly because it is a multiclass, multilabel classification problem with semi-structured documents and multi-faceted hierarchical categories.\par This article describes our SVM-based system and several techniques we developed successfully to adapt SVM for the specific features of the $F$-term patent classification task. We evaluate the techniques using the NTCIR-6 $F$-term classification terms assigned to Japanese patents. Moreover, our system participated in the NTCIR-6 patent classification evaluation and obtained the best results according to two of the three metrics used for task performance evaluation. 
Following the NTCIR-6 participation, we developed two new techniques, which achieved even better scores using all three NTCIR-6 metrics, effectively outperforming all participating systems. This article presents this new work and the experimental results that demonstrate the benefits of the latest approach.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "F-term classification; patent processing; support vector machines", } @Article{Fukumoto:2008:ICL, author = "Fumiyo Fukumoto and Yoshimi Suzuki", title = "Integrating Cross-Language Hierarchies and Its Application to Retrieving Relevant Documents", journal = j-TALIP, volume = "7", number = "3", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386869.1386870", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Aug 22 13:11:51 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Internet directories such as Yahoo! are an approach to improve the efficacy and efficiency of Information Retrieval (IR) on the Web, as pages (documents) are organized into hierarchical categories, and similar pages are grouped together. Most of the search engines on the Web service find documents that are assigned to a single classification hierarchy. Categories in the hierarchy are carefully defined by human experts and documents are well organized. However, a single hierarchy in one language is often insufficient to find all relevant material, as each hierarchy tends to have some bias in both defining hierarchical structure and classifying documents. Moreover, documents written in a language other than the user's native language often include large amounts of information related to the user's request. 
In this article, we propose a method of integrating cross-language (CL) category hierarchies, that is, Reuters '96 hierarchy and UDC code hierarchy of Japanese by estimating category similarities. The method does not simply merge two different hierarchies into one large hierarchy but instead extracts sets of similar categories, where each element of the sets is relevant to each other. It consists of three steps. First, we classify documents from one hierarchy into categories with another hierarchy using a cross-language text classification (CLTC) technique, and extract category pairs of two hierarchies. Next, we apply $\chi^2$ statistics to these pairs to obtain similar category pairs, and finally we apply the generating function of the Apriori algorithm (Apriori-Gen) to the category pairs, and find sets of similar categories. Moreover, we examined whether integrating hierarchies helps to support retrieval of documents with similar contents. The retrieval results showed a 42.7\% improvement over the baseline nonhierarchy model, and a 21.6\% improvement over a single hierarchy.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "cross-language hierarchies; information integration; retrieval of relevant documents; text classification", } @Article{Sharma:2008:AMI, author = "Utpal Sharma and Jugal K. Kalita and Rajib K. 
Das", title = "Acquisition of Morphology of an {Indic} Language from Text Corpus", journal = j-TALIP, volume = "7", number = "3", pages = "9:1--9:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386869.1386871", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Aug 22 13:11:51 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article describes an approach to unsupervised learning of morphology from an unannotated corpus for a highly inflectional Indo-European language called Assamese spoken by about 30 million people. Although Assamese is one of India's national languages, it utterly lacks computational linguistic resources. There exists no prior computational work on this language spoken widely in northeast India. The work presented is pioneering in this respect. In this article, we discuss salient issues in Assamese morphology where the presence of a large number of suffixal determiners, sandhi, samas, and the propensity to use suffix sequences make approximately 50\% of the words used in written and spoken text inflected. We implement methods proposed by Gaussier and Goldsmith on acquisition of morphological knowledge, and obtain F-measure performance below 60\%. This motivates us to present a method more suitable for handling suffix sequences, enabling us to increase the F-measure performance of morphology acquisition to almost 70\%. We describe how we build a morphological dictionary for Assamese from the text corpus. Using the morphological knowledge acquired and the morphological dictionary, we are able to process small chunks of data at a time as well as a large corpus. 
We achieve approximately 85\% precision and recall during the analysis of small chunks of coherent text.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Assamese; Indo-European languages; machine learning; morphology", } @Article{Chen:2008:TTR, author = "Jiang-Chun Chen and Jyh-Shing Roger Jang", title = "{TRUES}: {Tone Recognition Using Extended Segments}", journal = j-TALIP, volume = "7", number = "3", pages = "10:1--10:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1386869.1386872", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Aug 22 13:11:51 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Tone recognition has been a basic but important task for speech recognition and assessment of tonal languages, such as Mandarin Chinese. Most previously proposed approaches adopt a two-step approach where syllables within an utterance are identified via forced alignment first, and tone recognition using a variety of classifiers---such as neural networks, Gaussian mixture models (GMM), hidden Markov models (HMM), support vector machines (SVM)---is then performed on each segmented syllable to predict its tone. However, forced alignment does not always generate accurate syllable boundaries, leading to unstable voiced-unvoiced detection and deteriorating performance in tone recognition. Aiming to alleviate this problem, we propose a robust approach called Tone Recognition Using Extended Segments (TRUES) for HMM-based continuous tone recognition. The proposed approach extracts an unbroken pitch contour from a given utterance based on dynamic programming over time-domain acoustic features of average magnitude difference function (AMDF). 
The pitch contour of each syllable is then extended for tri-tone HMM modeling, such that the influence from inaccurate syllable boundaries is lessened. Our experimental results demonstrate that the proposed TRUES achieves 49.13\% relative error rate reduction over that of the recently proposed supratone modeling, which is deemed the state of the art of tone recognition that outperforms several previously proposed approaches. The encouraging improvement demonstrates the effectiveness and robustness of the proposed TRUES, as well as the corresponding pitch determination algorithm which produces unbroken pitch contours.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "context-dependent tone modeling; continuous tone recognition; extended segment for tone recognition; HMM; Mandarin Chinese; supratone modeling", } @Article{Lin:2008:VCD, author = "Jeng-Wei Lin and Jan-Ming Ho and Li-Ming Tseng and Feipei Lai", title = "Variant {Chinese} Domain Name Resolution", journal = j-TALIP, volume = "7", number = "4", pages = "11:1--11:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1450295.1450296", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Dec 8 13:56:10 MST 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Many efforts in past years have been made to lower the linguistic barriers for non-native English speakers to access the Internet. Internet standard RFC 3490, referred to as IDNA (Internationalizing Domain Names in Applications), focuses on access to IDNs (Internationalized Domain Names) in a range of scripts that is broader in scope than the original ASCII. However, the use of character variants that have similar appearances and/or interpretations could create confusion. 
A variant IDL (Internationalized Domain Label), derived from an IDL by replacing some characters with their variants, should match the original IDL; and thus a variant IDN does. In RFC 3743, referred to as JET (Joint Engineering Team) Guidelines, it is suggested that zone administrators model this concept of equivalence as an atomic IDL package. When an IDL is registered, an IDL package is created that contains its variant IDLs generated according to the zone-specific Language Variant Tables (LVTs). In addition to the registered IDL, the name holder can request the domain registry to activate some of the variant IDLs, free or by an extra fee. The activated variant IDLs are stored in the zone files, and thus become resolvable. However, an issue of scalability arises when there is a large number of variant IDLs to be activated.\par In this article, the authors present a resolution protocol that resolves the variant IDLs into the registered IDL, specifically for Han character variants. Two Han characters are said to be variants of each other if they have the same meaning and are pronounced the same. Furthermore, Han character variants usually have similar appearances. It is not uncommon that a Chinese IDL has a large number of variant IDLs. The proposed protocol introduces a new RR (resource record) type, denoted as VarIdx RR, to associate a variant expression of the variant IDLs with the registered IDL. The label of the VarIdx RR, denoted as the variant index, is assigned by an indexing function that is designed to give the same value to all of the variant IDLs enumerated by the variant expression. When one of the variant IDLs is accessed, Internet applications can compute the variant index, look up the VarIdx RRs, and resolve the variant IDL into the registered IDL.\par The authors examine two sets of Chinese IDLs registered in TWNIC and CNNIC, respectively. 
The results show that for a registered Chinese IDL, a very small number of VarIdx RRs, usually one or two, are sufficient to activate all of its variant IDLs. The authors also present a Web redirection service that employs the proposed resolution protocol to redirect a URL addressed by a variant IDN to the URL addressed by the registered IDN. The experiment results show that the proposed protocol successfully resolves the variant IDNs into the registered IDNs.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "conversion between traditional Chinese and simplified Chinese; Han character folding; Han character variant; IDN spoof; internationalized domain name; localization", } @Article{Lee:2008:BCQ, author = "Cheng-Wei Lee and Min-Yuh Day and Cheng-Lung Sung and Yi-Hsun Lee and Tian-Jian Jiang and Chia-Wei Wu and Cheng-Wei Shih and Yu-Ren Chen and Wen-Lian Hsu", title = "Boosting {Chinese} Question Answering with Two Lightweight Methods: {ABSPs} and {SCO-QAT}", journal = j-TALIP, volume = "7", number = "4", pages = "12:1--12:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1450295.1450297", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Dec 8 13:56:10 MST 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Question Answering (QA) research has been conducted in many languages. Nearly all the top performing systems use heavy methods that require sophisticated techniques, such as parsers or logic provers. However, such techniques are usually unavailable or unaffordable for under-resourced languages or in resource-limited situations. In this article, we describe how a top-performing Chinese QA system can be designed by using lightweight methods effectively. 
We propose two lightweight methods, namely the Sum of Co-occurrences of Question and Answer Terms (SCO-QAT) and Alignment-based Surface Patterns (ABSPs). SCO-QAT is a co-occurrence-based answer-ranking method that does not need extra knowledge, word-ignoring heuristic rules, or tools. It calculates co-occurrence scores based on the passage retrieval results. ABSPs are syntactic patterns trained from question-answer pairs with a multiple alignment algorithm. They are used to capture the relations between terms and then use the relations to filter answers. We attribute the success of the ABSPs and SCO-QAT methods to the effective use of local syntactic information and global co-occurrence information.\par By using SCO-QAT and ABSPs, we improved the RU-Accuracy of our testbed QA system, ASQA, from 0.445 to 0.535 on the NTCIR-5 dataset. It also achieved the top 0.5 RU-Accuracy on the NTCIR-6 dataset. The result shows that lightweight methods are not only cheaper to implement, but also have the potential to achieve state-of-the-art performances.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "answer filtering; answer ranking; Chinese question answering; co-occurrence; lightweight method; surface pattern", } @Article{Che:2008:UHC, author = "Wanxiang Che and Min Zhang and AiTi Aw and ChewLim Tan and Ting Liu and Sheng Li", title = "Using a Hybrid Convolution Tree Kernel for Semantic Role Labeling", journal = j-TALIP, volume = "7", number = "4", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1450295.1450298", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Dec 8 13:56:10 MST 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "As a kind of Shallow Semantic Parsing, Semantic Role Labeling 
(SRL) is gaining more attention as it benefits a wide range of natural language processing applications. Given a sentence, the task of SRL is to recognize semantic arguments (roles) for each predicate (target verb or noun). Feature-based methods have achieved much success in SRL and are regarded as the state-of-the-art methods for SRL. However, these methods are less effective in modeling structured features. As an extension of feature-based methods, kernel-based methods are able to capture structured features more efficiently in a much higher dimension. Application of kernel methods to SRL has been achieved by selecting the tree portion of a predicate and one of its arguments as feature space, which is named as predicate-argument feature (PAF) kernel. The PAF kernel captures the syntactic tree structure features using convolution tree kernel, however, it does not distinguish between the path structure and the constituent structure. In this article, a hybrid convolution tree kernel is proposed to model different linguistic objects. The hybrid convolution tree kernel consists of two individual convolution tree kernels. They are a Path kernel, which captures predicate-argument link features, and a Constituent Structure kernel, which captures the syntactic structure features of arguments. Evaluations on the data sets of the CoNLL-2005 SRL shared task and the Chinese PropBank (CPB) show that our proposed hybrid convolution tree kernel statistically significantly outperforms the previous tree kernels. Moreover, in order to maximize the system performance, we present a composite kernel through combining our hybrid convolution tree kernel method with a feature-based method extended by the polynomial kernel. 
The experimental results show that the composite kernel achieves better performance than each of the individual methods and outperforms the best reported system on the CoNLL-2005 corpus when only one syntactic parser is used and on the CPB corpus when automated syntactic parse results and correct syntactic parse results are used respectively.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "hybrid convolution tree kernel; semantic role labeling", } @Article{Wu:2009:ISI, author = "Chung-Hsien Wu and Haizhou Li", title = "Introduction to the Special Issue on Recent Advances in {Asian} Language Spoken Document Retrieval", journal = j-TALIP, volume = "8", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1482343.1482344", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 23 16:32:22 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chen:2009:WTM, author = "Berlin Chen", title = "Word Topic Models for Spoken Document Retrieval and Transcription", journal = j-TALIP, volume = "8", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1482343.1482345", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 23 16:32:22 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Statistical language modeling (LM), which aims to capture the regularities in human natural language and quantify the acceptability of a given word sequence, has long been an 
interesting yet challenging research topic in the speech and language processing community. It also has been introduced to information retrieval (IR) problems, and provided an effective and theoretically attractive probabilistic framework for building IR systems. In this article, we propose a word topic model (WTM) to explore the co-occurrence relationship between words, as well as the long-span latent topical information, for language modeling in spoken document retrieval and transcription. The document or the search history as a whole is modeled as a composite WTM model for generating a newly observed word. The underlying characteristics and different kinds of model structures are extensively investigated, while the performance of WTM is thoroughly analyzed and verified by comparison with the well-known probabilistic latent semantic analysis (PLSA) model as well as the other models. The IR experiments are performed on the TDT Chinese collections (TDT-2 and TDT-3), while the large vocabulary continuous speech recognition (LVCSR) experiments are conducted on the Mandarin broadcast news collected in Taiwan. 
Experimental results seem to indicate that WTM is a promising alternative to the existing models.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "adaptation; information retrieval; language model; speech recognition; word topic model", } @Article{Lin:2009:CSP, author = "Shih-Hsiang Lin and Berlin Chen and Hsin-Min Wang", title = "A Comparative Study of Probabilistic Ranking Models for {Chinese} Spoken Document Summarization", journal = j-TALIP, volume = "8", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1482343.1482346", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 23 16:32:22 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Extractive document summarization automatically selects a number of indicative sentences, passages, or paragraphs from an original document according to a target summarization ratio, and sequences them to form a concise summary. In this article, we present a comparative study of various probabilistic ranking models for spoken document summarization, including supervised classification-based summarizers and unsupervised probabilistic generative summarizers. We also investigate the use of unsupervised summarizers to improve the performance of supervised summarizers when manual labels are not available for training the latter. A novel training data selection approach that leverages the relevance information of spoken sentences to select reliable document-summary pairs derived by the probabilistic generative summarizers is explored for training the classification-based summarizers. 
Encouraging initial results on Mandarin Chinese broadcast news data are demonstrated.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "extractive summarization; probabilistic ranking models; relevance information; spoken document summarization", } @Article{Chen:2009:TSH, author = "Boxing Chen and Min Zhang and Ai Ti Aw", title = "Two-Stage Hypotheses Generation for Spoken Language Translation", journal = j-TALIP, volume = "8", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1482343.1482347", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 23 16:32:22 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Spoken Language Translation (SLT) is the research area that focuses on the translation of speech or text between two spoken languages. Phrase-based and syntax-based methods represent the state-of-the-art for statistical machine translation (SMT). The phrase-based method specializes in modeling local reorderings and translations of multiword expressions. The syntax-based method is enhanced by using syntactic knowledge, which can better model long word reorderings, discontinuous phrases, and syntactic structure. In this article, we leverage on the strength of these two methods and propose a strategy based on multiple hypotheses generation in a two-stage framework for spoken language translation. The hypotheses are generated in two stages, namely, decoding and regeneration. In the decoding stage, we apply state-of-the-art, phrase-based, and syntax-based methods to generate basic translation hypotheses. Then in the regeneration stage, much more hypotheses that cannot be captured by the decoding algorithms are produced from the basic hypotheses. 
We study three regeneration methods: redecoding, n-gram expansion, and confusion network in the second stage. Finally, an additional reranking pass is introduced to select the translation outputs by a linear combination of rescoring models. Experimental results on the Chinese-to-English IWSLT-2006 challenge task of translating the transcription of spontaneous speech show that the proposed mechanism achieves significant improvements over the baseline of about 2.80 BLEU-score.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "hypotheses generation; spoken language translation; statistical machine translation", } @Article{Chiang:2009:ISI, author = "David Chiang and Philipp Koehn", title = "Introduction to the Special Issue on Machine Translation of {Asian} Language", journal = j-TALIP, volume = "8", number = "2", pages = "5:1--5:??", month = may, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1526252.1526253", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{He:2009:IMH, author = "Xiaodong He and Mei Yang and Jianfeng Gao and Patrick Nguyen and Robert Moore", title = "Improved Monolingual Hypothesis Alignment for Machine Translation System Combination", journal = j-TALIP, volume = "8", number = "2", pages = "6:1--6:??", month = may, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1526252.1526254", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/; 
http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a new hypothesis alignment method for combining outputs of multiple machine translation (MT) systems. An indirect hidden Markov model (IHMM) is proposed to address the synonym matching and word ordering issues in hypothesis alignment. Unlike traditional HMMs whose parameters are trained via maximum likelihood estimation (MLE), the parameters of the IHMM are estimated indirectly from a variety of sources including word semantic similarity, word surface similarity, and a distance-based distortion penalty. The IHMM-based method significantly outperforms the state-of-the-art, TER-based alignment model in our experiments on NIST benchmark datasets. Our combined SMT system using the proposed method achieved the best Chinese-to-English translation result in the constrained training track of the 2008 NIST Open MT Evaluation.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "hidden Markov model; statistical machine translation; system combination; word alignment", } @Article{Ma:2009:BMW, author = "Yanjun Ma and Andy Way", title = "Bilingually Motivated Word Segmentation for Statistical Machine Translation", journal = j-TALIP, volume = "8", number = "2", pages = "7:1--7:??", month = may, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1526252.1526255", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We introduce a bilingually motivated word segmentation approach to languages where word boundaries are not orthographically marked, with application to Phrase-Based Statistical Machine Translation (PB-SMT). 
Our approach is motivated from the insight that PB-SMT systems can be improved by optimizing the input representation to reduce the predictive power of translation models. We firstly present an approach to optimize the existing segmentation of both source and target languages for PB-SMT and demonstrate the effectiveness of this approach using a Chinese--English MT task, that is, to measure the influence of the segmentation on the performance of PB-SMT systems. We report a 5.44\% relative increase in Bleu score and a consistent increase according to other metrics. We then generalize this method for Chinese word segmentation without relying on any segmenters and show that using our segmentation PB-SMT can achieve more consistent state-of-the-art performance across two domains. There are two main advantages of our approach. First of all, it is adapted to the specific translation task at hand by taking the corresponding source (target) language into account. Second, this approach does not rely on manually segmented training data so that it can be automatically adapted for different domains.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "alignment; bilingually motivated; phrase-based statistical machine translation; word segmentation", } @Article{Venkatapathy:2009:DMT, author = "Sriram Venkatapathy and Srinivas Bangalore", title = "Discriminative Machine Translation Using Global Lexical Selection", journal = j-TALIP, volume = "8", number = "2", pages = "8:1--8:??", month = may, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1526252.1526256", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Statistical phrase-based machine translation models crucially 
rely on word alignments. The search for word-alignments assumes a model of word locality between source and target languages that is violated in starkly different word-order languages such as English-Hindi. In this article, we present models that decouple the steps of lexical selection and lexical reordering with the aim of minimizing the role of word-alignment in machine translation. Indian languages are morphologically rich and have relatively free-word order where the grammatical role of content words is largely determined by their case markers and not just by their positions in the sentence. Hence, lexical selection plays a far greater role than lexical reordering. For lexical selection, we investigate models that take the entire source sentence into account and evaluate their performance for English-Hindi translation in a tourism domain.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "global lexical selection; machine translation", } @Article{Tsunakawa:2009:CJL, author = "Takashi Tsunakawa and Naoaki Okazaki and Xiao Liu and Jun'ichi Tsujii", title = "A {Chinese--Japanese} Lexical Machine Translation through a Pivot Language", journal = j-TALIP, volume = "8", number = "2", pages = "9:1--9:??", month = may, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1526252.1526257", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The bilingual lexicon is an expensive but critical resource for multilingual applications in natural language processing. This article proposes an integrated framework for building a bilingual lexicon between the Chinese and Japanese languages. 
Since the language pair Chinese--Japanese does not include English, which is a central language of the world, few large-scale bilingual resources between Chinese and Japanese have been constructed. One solution to alleviate this problem is to build a Chinese--Japanese bilingual lexicon through English as the pivot language. In addition to the pivotal approach, we can make use of the characteristics of Chinese and Japanese languages that use Han characters. We incorporate a translation model obtained from a small Chinese--Japanese lexicon and use the similarity of the hanzi and kanji characters by using the log-linear model. Our experimental results show that the use of the pivotal approach can improve the translation performance over the translation model built from a small Chinese--Japanese lexicon. The results also demonstrate that the similarity between the hanzi and kanji characters provides a positive effect for translating technical terms.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "bilingual lexicon; Han characters; hanzi; kanji; pivot language; statistical machine translation", } @Article{Chen:2009:USD, author = "Wenliang Chen and Daisuke Kawahara and Kiyotaka Uchimoto and Yujie Zhang and Hitoshi Isahara", title = "Using Short Dependency Relations from Auto-Parsed Data for {Chinese} Dependency Parsing", journal = j-TALIP, volume = "8", number = "3", pages = "10:1--10:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1568292.1568293", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:08 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Dependency parsing has become increasingly popular for a surge of interest lately for applications such as machine translation and question 
answering. Currently, several supervised learning methods can be used for training high-performance dependency parsers if sufficient labeled data are available.\par However, currently used statistical dependency parsers provide poor results for words separated by long distances. In order to solve this problem, this article presents an effective dependency parsing approach of incorporating short dependency information from unlabeled data. The unlabeled data is automatically parsed by using a deterministic dependency parser, which exhibits a relatively high performance for short dependencies between words. We then train another parser that uses the information on short dependency relations extracted from the output of the first parser. The proposed approach achieves an unlabeled attachment score of 86.52\%, an absolute 1.24\% improvement over the baseline system on the Chinese Treebank data set. The results indicate that the proposed approach improves the parsing performance for longer distance words.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Chinese dependency parsing; semi-supervised learning; unlabeled data", } @Article{Chanda:2009:WWT, author = "Sukalpa Chanda and Umapada Pal and Oriol Ramos Terrades", title = "Word-Wise {Thai} and {Roman} Script Identification", journal = j-TALIP, volume = "8", number = "3", pages = "11:1--11:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1568292.1568294", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:08 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In some Thai documents, a single text line of a printed document page may contain words of both Thai and Roman scripts. 
For the Optical Character Recognition (OCR) of such a document page it is better to identify, at first, Thai and Roman script portions and then to use individual OCR systems of the respective scripts on these identified portions. In this article, an SVM-based method is proposed for identification of word-wise printed Roman and Thai scripts from a single line of a document page. Here, at first, the document is segmented into lines and then lines are segmented into character groups (words). In the proposed scheme, we identify the script of a character group combining different character features obtained from structural shape, profile behavior, component overlapping information, topological properties, and water reservoir concept, etc. Based on the experiment on 10,000 data (words) we obtained 99.62\% script identification accuracy from the proposed scheme.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Multi-script OCR; script identification; SVM; Thai Script", } @Article{Nguyen:2009:WSC, author = "Cam-Tu Nguyen and Xuan-Hieu Phan and Susumu Horiguchi and Thu-Trang Nguyen and Quang-Thuy Ha", title = "{Web} Search Clustering and Labeling with Hidden Topics", journal = j-TALIP, volume = "8", number = "3", pages = "12:1--12:??", month = aug, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1568292.1568295", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:08 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Web search clustering is a solution to reorganize search results (also called ``snippets'') in a more convenient way for browsing. 
There are three key requirements for such post-retrieval clustering systems: (1) the clustering algorithm should group similar documents together; (2) clusters should be labeled with descriptive phrases; and (3) the clustering system should provide high-quality clustering without downloading the whole Web page.\par This article introduces a novel framework for clustering Web search results in Vietnamese which targets the three above issues. The main motivation is that by enriching short snippets with hidden topics from huge resources of documents on the Internet, it is able to cluster and label such snippets effectively in a topic-oriented manner without concerning whole Web pages. Our approach is based on recent successful topic analysis models, such as Probabilistic-Latent Semantic Analysis, or Latent Dirichlet Allocation. The underlying idea of the framework is that we collect a very large external data collection called ``universal dataset,'' and then build a clustering system on both the original snippets and a rich set of hidden topics discovered from the universal data collection. This can be seen as a richer representation of snippets to be clustered. We carry out careful evaluation of our method and show that our method can yield impressive clustering quality.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "cluster labeling; collocation; hidden topics analysis; Hierarchical Agglomerative Clustering; Latent Dirichlet allocation; Vietnamese; Web search clustering", } @Article{Shaalan:2009:ISI, author = "K. Shaalan and A. 
Farghaly", title = "Introduction to the Special Issue on {Arabic} Natural Language Processing", journal = j-TALIP, volume = "8", number = "4", pages = "13:1--13:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644880", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Farghaly:2009:ANL, author = "Ali Farghaly and Khaled Shaalan", title = "{Arabic} Natural Language Processing: Challenges and Solutions", journal = j-TALIP, volume = "8", number = "4", pages = "14:1--14:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644881", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Arabic language presents researchers and developers of natural language processing (NLP) applications for Arabic text and speech with serious challenges. The purpose of this article is to describe some of these challenges and to present some solutions that would guide current and future practitioners in the field of Arabic natural language processing (ANLP). We begin with general features of the Arabic language in Sections 1, 2, and 3 and then we move to more specific properties of the language in the rest of the article. In Section 1 of this article we highlight the significance of the Arabic language today and describe its general properties. Section 2 presents the feature of Arabic Diglossia showing how the sociolinguistic aspects of the Arabic language differ from other languages. 
The stability of Arabic Diglossia and its implications for ANLP applications are discussed and ways to deal with this problematic property are proposed. Section 3 deals with the properties of the Arabic script and the explosion of ambiguity that results from the absence of short vowel representations and overt case markers in contemporary Arabic texts. We present in Section 4 specific features of the Arabic language such as the nonconcatenative property of Arabic morphology, Arabic as an agglutinative language, Arabic as a pro-drop language, and the challenge these properties pose to ANLP. We also present solutions that have already been adopted by some pioneering researchers in the field. In Section 5 we point out to the lack of formal and explicit grammars of Modern Standard Arabic which impedes the progress of more advanced ANLP systems. In Section 6 we draw our conclusion.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic dialects; Arabic script; Modern Standard Arabic", } @Article{Espana-Bonet:2009:DPB, author = "Cristina Espa{\~n}a-Bonet and Jes{\'u}s Gim{\'e}nez and Llu{\'\i}s M{\`a}rquez", title = "Discriminative Phrase-Based Models for {Arabic} Machine Translation", journal = j-TALIP, volume = "8", number = "4", pages = "15:1--15:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644882", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "A design for an Arabic-to-English translation system is presented. 
The core of the system implements a standard phrase-based statistical machine translation architecture, but it is extended by incorporating a local discriminative phrase selection model to address the semantic ambiguity of Arabic. Local classifiers are trained using linguistic information and context to translate a phrase, and this significantly increases the accuracy in phrase selection with respect to the most frequent translation traditionally considered. These classifiers are integrated into the translation system so that the global task gets benefits from the discriminative learning. As a result, we obtain significant improvements in the full translation task at the lexical, syntactic, and semantic levels as measured by an heterogeneous set of automatic evaluation metrics.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic; discriminative learning; English; statistical machine translation", } @Article{Benajiba:2009:MBS, author = "Yassine Benajiba and Imed Zitouni", title = "Morphology-Based Segmentation Combination for {Arabic} Mention Detection", journal = j-TALIP, volume = "8", number = "4", pages = "16:1--16:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644883", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Arabic language has a very rich/complex morphology. Each Arabic word is composed of zero or more {\em prefixes}, one {\em stem\/} and zero or more {\em suffixes}. Consequently, the Arabic data is sparse compared to other languages such as English, and it is necessary to conduct word segmentation before any natural language processing task. 
Therefore, the word-segmentation step is worth a deeper study since it is a preprocessing step which shall have a significant impact on all the steps coming afterward. In this article, we present an Arabic mention detection system that has very competitive results in the recent Automatic Content Extraction (ACE) evaluation campaign. We investigate the impact of different segmentation schemes on Arabic mention detection systems and we show how these systems may benefit from more than one segmentation scheme. We report the performance of several mention detection models using different kinds of possible and known segmentation schemes for Arabic text: punctuation separation, Arabic Treebank, and morphological and character-level segmentations. We show that the combination of competitive segmentation styles leads to a better performance. Results indicate a statistically significant improvement when Arabic Treebank and morphological segmentations are combined.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic information extraction; Arabic mention detection; Arabic segmentation", } @Article{Zitouni:2009:CLI, author = "Imed Zitouni and Radu Florian", title = "Cross-Language Information Propagation for {Arabic} Mention Detection", journal = j-TALIP, volume = "8", number = "4", pages = "17:1--17:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644884", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In the last two decades, significant effort has been put into annotating linguistic resources in several languages. 
Despite this valiant effort, there are still many languages left that have only small amounts of such resources. The goal of this article is to present and investigate a method of propagating information (specifically mention detection) from a resource-rich language into a relatively resource-poor language such as Arabic. Part of the investigation is to quantify the contribution of propagating information in different conditions based on the availability of resources in the target language. Experiments on the language pair Arabic-English show that one can achieve relatively decent performance by propagating information from a language with richer resources such as English into Arabic alone (no resources or models in the source language Arabic). Furthermore, results show that propagated features from English do help improve the Arabic system performance even when used in conjunction with all feature types built from the source language. Experiments also show that using propagated features in conjunction with lexically derived features only (as can be obtained directly from a mention annotated corpus) brings the system performance at the one obtained in the target language by using feature derived from many linguistic resources, therefore improving the system when such resources are not available. 
In addition to Arabic-English language pair, we investigate the effectiveness of our approach on other language pairs such as Chinese--English and Spanish--English.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic information extraction; Arabic mention detection", } @Article{Lamel:2009:AST, author = "Lori Lamel and Abdelkhalek Messaoudi and Jean-Luc Gauvain", title = "Automatic Speech-to-Text Transcription in {Arabic}", journal = j-TALIP, volume = "8", number = "4", pages = "18:1--18:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644885", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Arabic language presents a number of challenges for speech recognition, arising in part from the significant differences in the spoken and written forms, in particular the conventional form of texts being non-vowelized. Being a highly inflected language, the Arabic language has a very large lexical variety and typically with several possible (generally semantically linked) vowelizations for each written form. This article summarizes research carried out over the last few years on speech-to-text transcription of broadcast data in Arabic. The initial research was oriented toward processing of broadcast news data in Modern Standard Arabic, and has since been extended to address a larger variety of broadcast data, which as a consequence results in the need to also be able to handle dialectal speech. 
While standard techniques in speech recognition have been shown to apply well to the Arabic language, taking into account language specificities helps to significantly improve system performance.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic language processing; automatic speech recognition; morphological decomposition; speech processing; speech-to-text transcription", } @Article{Moisl:2009:SLL, author = "Hermann Moisl", title = "Sura Length and Lexical Probability Estimation in Cluster Analysis of the {Qur'an}", journal = j-TALIP, volume = "8", number = "4", pages = "19:1--19:??", month = dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1644879.1644886", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:37:17 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Thabet [2005] applied cluster analysis to the Qur'an in the hope of generating a classification of the suras that is useful for understanding of its thematic structure. The result was positive, but variation in sura length was a problem because clustering of the shorter suras was found to be unreliable. The present discussion addresses this problem in four parts. The first part summarizes Thabet's work. The second part argues that unreliable clustering of the shorter suras is a consequence of poor estimation of lexical population probabilities in those suras. The third part proposes a solution to the problem based on calculation of a minimum length threshold using concepts from statistical sampling theory followed by selection of suras and lexical variables based on that threshold. 
The fourth part applies the proposed solution to a reanalysis of the Qur'an.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Arabic natural language processing; cluster analysis; document length normalization; lexical probability estimation; Qur'an sampling", } @Article{Hsu:2010:MST, author = "Chung-Chian Hsu and Chien-Hsing Chen", title = "Mining Synonymous Transliterations from the {World Wide Web}", journal = j-TALIP, volume = "9", number = "1", pages = "1:1--1:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1731035.1731036", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:34:01 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The World Wide Web has been considered one of the important sources for information. Using search engines to retrieve Web pages can gather lots of information, including foreign information. However, to be better understood by local readers, proper names in a foreign language, such as English, are often transliterated to a local language such as Chinese. Due to different translators and the lack of translation standard, translating foreign proper nouns may result in different transliterations and pose a notorious headache. In particular, it may cause incomplete search results. Using one transliteration as a query keyword will fail to retrieve the Web pages which use a different word as the transliteration. Consequently, important information may be missed. We present a framework for mining synonymous transliterations as many as possible from the Web for a given transliteration. The results can be used to construct a database of synonymous transliterations which can be utilized for query expansion so as to alleviate the incomplete search problem. 
Experimental results show that the proposed framework can effectively retrieve the set of snippets which may contain synonymous transliterations and then extract the target terms. Most of the extracted synonymous transliterations have higher rank of similarity to the input transliteration compared to other noise terms.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Chinese transliteration; cross-lingual information retrieval; synonymous transliteration; text mining; Web mining", } @Article{Liu:2010:ISS, author = "Feifan Liu and Yang Liu", title = "Identification of Soundbite and Its Speaker Name Using Transcripts of Broadcast News Speech", journal = j-TALIP, volume = "9", number = "1", pages = "2:1--2:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1731035.1731037", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:34:01 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a pipeline framework for identifying soundbite and its speaker name from Mandarin broadcast news transcripts. Both of the two modules, soundbite segment detection and soundbite speaker name recognition, are based on a supervised classification approach using multiple linguistic features. We systematically evaluated performance for each module as well as the entire system, and investigated the effect of using speech recognition (ASR) output and automatic sentence segmentation. We found that both of the two components impact the pipeline system, with more degradation in the entire system performance due to automatic speaker name recognition errors than soundbite segment detection. 
In addition, our experimental results show that using ASR output degrades the system performance significantly, and that using automatic sentence segmentation greatly impacts soundbite detection, but has much less effect on speaker name recognition.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "automatic speech recognition; sentence segmentation; Soundbite detection; speaker name recognition", } @Article{Tepper:2010:IMU, author = "Michael Tepper and Fei Xia", title = "Inducing Morphemes Using Light Knowledge", journal = j-TALIP, volume = "9", number = "1", pages = "3:1--3:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1731035.1731038", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:34:01 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Allomorphic variation, or form variation among morphs with the same meaning, is a stumbling block to morphological induction (MI). To address this problem, we present a hybrid approach that uses a small amount of linguistic knowledge in the form of orthographic rewrite rules to help refine an existing MI-produced segmentation. Using rules, we derive underlying analyses of morphs---generalized with respect to contextual spelling differences---from an existing surface morph segmentation, and from these we learn a morpheme-level segmentation. To learn morphemes, we have extended the Morfessor segmentation algorithm [Creutz and Lagus 2004; 2005; 2006] by using rules to infer possible underlying analyses from surface segmentations. A segmentation produced by Morfessor Categories-MAP Software v. 0.9.2 is used as input to our procedure and as a baseline that we evaluate against. 
To suggest analyses for our procedure, a set of language-specific orthographic rules is needed. Our procedure has yielded promising improvements for English and Turkish over the baseline approach when tested on the Morpho Challenge 2005 and 2007 style evaluations. On the Morpho Challenge 2007 test evaluation, we report gains over the current best unsupervised contestant for Turkish, where our technique shows a 2.5\% absolute {\em F\/} -score improvement.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "allomorphy; computational linguistics; machine learning; Morphological induction", } @Article{Baldwin:2010:RMB, author = "Timothy Baldwin and Su Nam Kim and Francis Bond and Sanae Fujita and David Martinez and Takaaki Tanaka", title = "A Reexamination of {MRD}-Based Word Sense Disambiguation", journal = j-TALIP, volume = "9", number = "1", pages = "4:1--4:??", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1731035.1731039", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Mar 29 15:34:01 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article reconsiders the task of MRD-based word sense disambiguation, in extending the basic Lesk algorithm to investigate the impact on WSD performance of different tokenization schemes and methods of definition extension. In experimentation over the Hinoki Sensebank and the Japanese Senseval-2 dictionary task, we demonstrate that sense-sensitive definition extension over hyponyms, hypernyms, and synonyms, combined with definition extension and word tokenization leads to WSD accuracy above both unsupervised and supervised baselines. 
In doing so, we demonstrate the utility of ontology induction and establish new opportunities for the development of baseline unsupervised WSD methods.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Japanese; machine-readable dictionary; word sense disambiguation", } @Article{Zhao:2010:UCB, author = "Hai Zhao and Chang-Ning Huang and Mu Li and Bao-Liang Lu", title = "A Unified Character-Based Tagging Framework for {Chinese} Word Segmentation", journal = j-TALIP, volume = "9", number = "2", pages = "5:1--5:??", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1781134.1781135", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 21 18:03:02 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Chinese word segmentation is an active area in Chinese language processing though it is suffering from the argument about what precisely is a word in Chinese. Based on corpus-based segmentation standard, we launched this study. In detail, we regard Chinese word segmentation as a character-based tagging problem. We show that there has been a potent trend of using a character-based tagging approach in this field. In particular, learning from segmented corpus with or without additional linguistic resources is treated in a unified way in which the only difference depends on how the feature template set is selected. It differs from existing work in that both feature template selection and tag set selection are considered in our approach, instead of the previous feature template focus only technique. We show that there is a significant performance difference as different tag sets are selected. This is especially applied to a six-tag set, which is good enough for most current segmented corpora. 
The linguistic meaning of a tag set is also discussed. Our results show that a simple learning system with six $n$-gram feature templates and a six-tag set can obtain competitive performance in the cases of learning only from a training corpus. In cases when additional linguistic resources are available, an ensemble learning technique, assistant segmenter, is proposed and its effectiveness is verified. Assistant segmenter is also proven to be an effective method as segmentation standard adaptation that outperforms existing ones. Based on the proposed approach, our system provides state-of-the-art performance in all 12 corpora of three international Chinese word segmentation bakeoffs.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "assistant segmenter; character-based tagging method; Chinese word segmentation; conditional random field; tag set selection", } @Article{Guo:2010:LIS, author = "Yuqing Guo and Haifeng Wang and Josef van Genabith", title = "A Linguistically Inspired Statistical Model for {Chinese} Punctuation Generation", journal = j-TALIP, volume = "9", number = "2", pages = "6:1--6:??", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1781134.1781136", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 21 18:03:02 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article investigates a relatively underdeveloped subject in natural language processing---the generation of punctuation marks. From a theoretical perspective, we study 16 Chinese punctuation marks as defined in the Chinese national standard of punctuation usage, and categorize these punctuation marks into three different types according to their syntactic properties. 
We implement a three-tier maximum entropy model incorporating linguistically-motivated features for generating the commonly used Chinese punctuation marks in unpunctuated sentences output by a surface realizer. Furthermore, we present a method to automatically extract cue words indicating sentence-final punctuation marks as a specialized feature to construct a more precise model. Evaluating on the Penn Chinese Treebank data, the MaxEnt model achieves an {\em f\/} -score of 79.83\% for punctuation insertion and 74.61\% for punctuation restoration using gold data input, 79.50\% for insertion and 73.32\% for restoration using parser-based imperfect input. The experiments show that the MaxEnt model significantly outperforms a baseline 5-gram language model that scores 54.99\% for punctuation insertion and 52.01\% for restoration. We show that our results are not far from human performance on the same task with human insertion {\em f\/} -scores in the range of 81-87\% and human restoration in the range of 71-82\%. 
Finally, a manual error analysis of the generation output shows that close to 40\% of the mismatched punctuation marks do in fact result in acceptable choices, a fact obscured in the automatic string-matching based evaluation scores.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Chinese punctuation marks; maximum entropy model; sentence realization", } @Article{Naptali:2010:TDL, author = "Welly Naptali and Masatoshi Tsuchiya and Seiichi Nakagawa", title = "Topic-Dependent Language Model with Voting on Noun History", journal = j-TALIP, volume = "9", number = "2", pages = "7:1--7:??", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1781134.1781137", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Jun 21 18:03:02 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Language models (LMs) are an important field of study in automatic speech recognition (ASR) systems. LM helps acoustic models find the corresponding word sequence of a given speech signal. Without it, ASR systems would not understand the language and it would be hard to find the correct word sequence. During the past few years, researchers have tried to incorporate long-range dependencies into statistical word-based $n$-gram LMs. One of these long-range dependencies is topic. Unlike words, topic is unobservable. Thus, it is required to find the meanings behind the words to get into the topic. This research is based on the belief that nouns contain topic information. We propose a new approach for a topic-dependent LM, where the topic is decided in an unsupervised manner. Latent Semantic Analysis (LSA) is employed to reveal hidden (latent) relations among nouns in the context words. 
To decide the topic of an event, a fixed size word history sequence (window) is observed, and voting is then carried out based on noun class occurrences weighted by a confidence measure. Experiments were conducted on an English corpus and a Japanese corpus: {\em The Wall Street Journal\/} corpus and {\em Mainichi Shimbun\/} (Japanese newspaper) corpus. The results show that our proposed method gives better perplexity than the comparative baselines, including a word-based/class-based $n$-gram LM, their interpolated LM, a cache-based LM, a topic-dependent LM based on $n$-gram, and a topic-dependent LM based on Latent Dirichlet Allocation (LDA). The {\em n\/} -best list rescoring was conducted to validate its application in ASR systems.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Language model; latent semantic analysis; perplexity; speech recognition; topic dependent", } @Article{Ng:2010:SJ, author = "Hwee Tou Ng", title = "The State of the Journal", journal = j-TALIP, volume = "9", number = "3", pages = "8:1--8:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838745.1838750", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Sep 18 15:58:58 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Harman:2010:ISI, author = "Donna Harman and Noriko Kando and Prasenjit Majumder and Mandar Mitra and Carol Peters", title = "Introduction to the {Special Issue on Indian Language Information Retrieval Part I}", journal = j-TALIP, volume = "9", number = "3", pages = "9:1--9:??", month = sep, year = "2010", CODEN = "????", DOI = 
"https://doi.org/10.1145/1838745.1838746", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Sep 18 15:58:58 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Majumder:2010:FEE, author = "Prasenjit Majumder and Mandar Mitra and Dipasree Pal and Ayan Bandyopadhyay and Samaresh Maiti and Sukomal Pal and Deboshree Modak and Sucharita Sanyal", title = "The {FIRE 2008} Evaluation Exercise", journal = j-TALIP, volume = "9", number = "3", pages = "10:1--10:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838745.1838747", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Sep 18 15:58:58 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The aim of the Forum for Information Retrieval Evaluation (FIRE) is to create an evaluation framework in the spirit of TREC (Text REtrieval Conference), CLEF (Cross-Language Evaluation Forum), and NTCIR (NII Test Collection for IR Systems), for Indian language Information Retrieval. The first evaluation exercise conducted by FIRE was completed in 2008. 
This article describes the test collections used at FIRE 2008, summarizes the approaches adopted by various participants, discusses the limitations of the datasets, and outlines the tasks planned for the next iteration of FIRE.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "evaluation; Indian languages; information retrieval", } @Article{Dolamic:2010:CSI, author = "Ljiljana Dolamic and Jacques Savoy", title = "Comparative Study of Indexing and Search Strategies for the {Hindi}, {Marathi}, and {Bengali} Languages", journal = j-TALIP, volume = "9", number = "3", pages = "11:1--11:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838745.1838748", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Sep 18 15:58:58 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The main goal of this article is to describe and evaluate various indexing and search strategies for the Hindi, Bengali, and Marathi languages. These three languages are ranked among the world's 20 most spoken languages and they share similar syntax, morphology, and writing systems. In this article we examine these languages from an Information Retrieval (IR) perspective through describing the key elements of their inflectional and derivational morphologies, and suggest a light and more aggressive stemming approach based on them.\par In our evaluation of these stemming strategies we make use of the FIRE 2008 test collections, and then to broaden our comparisons we implement and evaluate two language independent indexing methods: the $n$-gram and trunc-$n$ (truncation of the first $n$ letters). 
We evaluate these solutions by applying our various IR models, including the Okapi, Divergence from Randomness (DFR) and statistical language models (LM) together with two classical vector-space approaches: {\em tf idf\/} and {\em Lnu-ltc}.\par Experiments performed with all three languages demonstrate that the I(n$_e$)C2 model derived from the Divergence from Randomness paradigm tends to provide the best mean average precision (MAP). Our own tests suggest that improved retrieval effectiveness would be obtained by applying more aggressive stemmers, especially those accounting for certain derivational suffixes, compared to those involving a light stemmer or ignoring this type of word normalization procedure. Comparisons between no stemming and stemming indexing schemes shows that performance differences are almost always statistically significant. When, for example, an aggressive stemmer is applied, the relative improvements obtained are $\approx$28\% for the Hindi language, $\approx$42\% for Marathi, and $\approx$18\% for Bengali, as compared to a no-stemming approach. Based on a comparison of word-based and language-independent approaches we find that the trunc-4 indexing scheme tends to result in performance levels statistically similar to those of an aggressive stemmer, yet better than the 4-gram indexing scheme. A query-by-query analysis reveals the reasons for this, and also demonstrates the advantage of applying a stemming or a trunc-4 indexing scheme.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "Bengali language; Hindi language; Indic languages; Marathi language; natural language processing with Indo-European languages; search engines for Asian languages; stemmer", } @Article{Leveling:2010:SWI, author = "Johannes Leveling and Gareth J. F. 
Jones", title = "Sub-Word Indexing and Blind Relevance Feedback for {English}, {Bengali}, {Hindi}, and {Marathi} {IR}", journal = j-TALIP, volume = "9", number = "3", pages = "12:1--12:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838745.1838749", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Sep 18 15:58:58 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Forum for Information Retrieval Evaluation (FIRE) provides document collections, topics, and relevance assessments for information retrieval (IR) experiments on Indian languages. Several research questions are explored in this article: (1) How to create a simple, language-independent corpus-based stemmer, (2) How to identify sub-words and which types of sub-words are suitable as indexing units, and (3) How to apply blind relevance feedback on sub-words and how feedback term selection is affected by the type of the indexing unit. More than 140 IR experiments are conducted using the BM25 retrieval model on the topic titles and descriptions (TD) for the FIRE 2008 English, Bengali, Hindi, and Marathi document collections.\par The major findings are: The corpus-based stemming approach is effective as a knowledge-light term conflation step and useful in the case of few language-specific resources. For English, the corpus-based stemmer performs nearly as well as the Porter stemmer and significantly better than the baseline of indexing words when combined with query expansion. In combination with blind relevance feedback, it also performs significantly better than the baseline for Bengali and Marathi IR.\par Sub-words such as consonant-vowel sequences and word prefixes can yield similar or better performance in comparison to word indexing. There is no best performing method for all languages. 
For English, indexing using the Porter stemmer performs best, for Bengali and Marathi, overlapping 3-grams obtain the best result, and for Hindi, 4-prefixes yield the highest MAP. However, in combination with blind relevance feedback using 10 documents and 20 terms, 6-prefixes for English and 4-prefixes for Bengali, Hindi, and Marathi IR yield the highest MAP.\par Sub-word identification is a general case of decompounding. It results in one or more index terms for a single word form and increases the number of index terms but decreases their average length. The corresponding retrieval experiments show that relevance feedback on sub-words benefits from selecting a larger number of index terms in comparison with retrieval on word forms. Similarly, selecting the number of relevance feedback terms depending on the ratio of word vocabulary size to sub-word vocabulary size almost always slightly increases information retrieval effectiveness compared to using a fixed number of terms for different languages.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", keywords = "blind relevance feedback; evaluation; FIRE; Information retrieval; stemming; sub-word indexing", } @Article{Kumaran:2010:CMT, author = "A. Kumaran and Mitesh M. 
Khapra and Pushpak Bhattacharyya", title = "Compositional Machine Transliteration", journal = j-TALIP, volume = "9", number = "4", pages = "13:1--13:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838751.1838752", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Dec 15 10:47:09 MST 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Machine transliteration is an important problem in an increasingly multilingual world, as it plays a critical role in many downstream applications, such as machine translation or crosslingual information retrieval systems. In this article, we propose compositional machine transliteration systems, where multiple transliteration components may be composed either to improve existing transliteration quality, or to enable transliteration functionality between languages even when no direct parallel names corpora exist between them. Specifically, we propose two distinct forms of composition: serial and parallel. Serial compositional system chains individual transliteration components, say, $X \rightarrow Y$ and $Y \rightarrow Z$ systems, to provide transliteration functionality, $X \rightarrow Z$.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chinnakotla:2010:TRS, author = "Manoj K. Chinnakotla and Om P. 
Damani and Avijit Satoskar", title = "Transliteration for Resource-Scarce Languages", journal = j-TALIP, volume = "9", number = "4", pages = "14:1--14:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838751.1838753", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Dec 15 10:47:09 MST 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Today, parallel corpus-based systems dominate the transliteration landscape. But the resource-scarce languages do not enjoy the luxury of large parallel transliteration corpus. For these languages, rule-based transliteration is the only viable option. In this article, we show that by properly harnessing the monolingual resources in conjunction with manually created rule base, one can achieve reasonable transliteration performance. We achieve this performance by exploiting the power of Character Sequence Modeling (CSM), which requires only monolingual resources. We present the results of our rule-based system for Hindi to English, English to Hindi, and Persian to English transliteration tasks. 
We also perform extrinsic evaluation of transliteration systems in the context of Cross Lingual Information Retrieval.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mukund:2010:IES, author = "Smruthi Mukund and Rohini Srihari and Erik Peterson", title = "An Information-Extraction System for {Urdu}---{A} Resource-Poor Language", journal = j-TALIP, volume = "9", number = "4", pages = "15:1--15:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838751.1838754", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Dec 15 10:47:09 MST 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "There has been an increase in the amount of multilingual text on the Internet due to the proliferation of news sources and blogs. The Urdu language, in particular, has experienced explosive growth on the Web. Text mining for information discovery, which includes tasks such as identifying topics, relationships and events, and sentiment analysis, requires sophisticated natural language processing (NLP). NLP systems begin with modules such as word segmentation, part-of-speech tagging, and morphological analysis and progress to modules such as shallow parsing and named entity tagging. 
While there have been considerable advances in developing such comprehensive NLP systems for English, the work for Urdu is still in its infancy.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Katz:2011:ISI, author = "Graham Katz and Mona Diab", title = "Introduction to the Special Issue on {Arabic} Computational Linguistics", journal = j-TALIP, volume = "10", number = "1", pages = "1:1--1:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929909", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Condon:2011:MTE, author = "S. Condon and D. Parvaz and J. Aberdeen and C. Doran and A. Freeman and M. Awad", title = "Machine Translation Errors: {English} and {Iraqi Arabic}", journal = j-TALIP, volume = "10", number = "1", pages = "2:1--2:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929910", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Errors in machine translations of English-Iraqi Arabic dialogues were analyzed using the methods developed for the Human Translation Error Rate measure (HTER). Human annotations were used to refine the Translation Error Rate (TER) annotations. The analyses were performed on approximately 100 translations into each language from four translation systems. 
Results include high frequencies of pronoun errors and errors involving the copula in translations to English. High frequencies of errors in subject/person inflection and closed-word classes characterized translations to Iraqi Arabic. There were similar frequencies of word order errors in both translation directions and low frequencies of polarity errors. The problems associated with many errors can be predicted from structural differences between the two languages.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Rytting:2011:SCD, author = "C. Anton Rytting and David M. Zajic and Paul Rodrigues and Sarah C. Wayland and Christian Hettick and Tim Buckwalter and Charles C. Blake", title = "Spelling Correction for Dialectal {Arabic} Dictionary Lookup", journal = j-TALIP, volume = "10", number = "1", pages = "3:1--3:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929911", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The ``Did You Mean\ldots{}?'' system, described in this article, is a spelling corrector for Arabic that is designed specifically for L2 learners of dialectal Arabic in the context of dictionary lookup. The authors use an orthographic density metric to motivate the need for a finer-grained ranking method for candidate words than unweighted Levenshtein edit distance. The Did You Mean\ldots{}? 
architecture is described, and the authors show that mean reciprocal rank can be improved by tuning operation weights according to sound confusions, and by anticipating likely spelling variants.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kulick:2011:ESC, author = "Seth Kulick", title = "Exploiting Separation of Closed-Class Categories for {Arabic} Tokenization and Part-of-Speech Tagging", journal = j-TALIP, volume = "10", number = "1", pages = "4:1--4:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929912", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Research on the problem of morphological disambiguation of Arabic has noted that techniques developed for lexical disambiguation in English do not easily transfer over, since the affixation present in Arabic creates a very different tag set than for English, encoding both inflectional morphology and more complex tokenization sequences. This work takes a new approach to this problem based on a distinction between the open-class and closed-class categories of tokens, which differ both in their frequencies and in their possible morphological affixations. 
This separation simplifies the morphological analysis problem considerably, making it possible to use a Conditional Random Field model for joint tokenization and ``core'' part-of-speech tagging of the open-class items, while the closed-class items are handled by regular expressions.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Abdul-Mageed:2011:ADA, author = "Muhammad Abdul-Mageed", title = "Automatic Detection of {Arabic} Non-Anaphoric Pronouns for Improving Anaphora Resolution", journal = j-TALIP, volume = "10", number = "1", pages = "5:1--5:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929913", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Anaphora resolution is one of the most difficult tasks in NLP. The ability to identify non-referential pronouns before attempting an anaphora resolution task would be significant, since the system would not have to attempt resolving such pronouns and hence end up with fewer errors. In addition, the number of non-referential pronouns has been found to be non-trivial in many domains. The task of detecting non-referential pronouns could also be incorporated into a part-of-speech tagger or a parser, or treated as an initial step in semantic interpretation. 
In this article, I describe a machine learning method for identifying non-referential pronouns in an annotated subsegment of the Penn Arabic Treebank using three different feature settings.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wu:2011:IPD, author = "Chung-Hsien Wu and Wei-Bin Liang and Jui-Feng Yeh", title = "Interruption Point Detection of Spontaneous Speech Using Inter-Syllable Boundary-Based Prosodic Features", journal = j-TALIP, volume = "10", number = "1", pages = "6:1--6:??", month = mar, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1929908.1929914", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Mar 16 18:07:50 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a probabilistic scheme for detecting the interruption point (IP) in spontaneous speech based on inter-syllable boundary-based prosodic features. Because of the high error rate in spontaneous speech recognition, a combined acoustic model considering both syllable and subsyllable recognition units, is firstly used to determine the inter-syllable boundaries and output the recognition confidence of the input speech. Based on the finding that IPs always occur at inter-syllable boundaries, a probability distribution of the prosodic features at the current potential IP is estimated. 
The Conditional Random Field (CRF) model, which employs the clustered prosodic features of the current potential IP and its preceding and succeeding inter-syllable boundaries, is employed to output the IP likelihood measure.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wu:2011:ADS, author = "Chung-Hsien Wu and Hung-Yu Su and Han-Ping Shen", title = "Articulation-Disordered Speech Recognition Using Speaker-Adaptive Acoustic Models and Personalized Articulation Patterns", journal = j-TALIP, volume = "10", number = "2", pages = "7:1--7:??", month = jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1967293.1967294", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 28 18:29:03 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a novel approach to speaker-adaptive recognition of speech from articulation-disordered speakers without a large amount of adaptation data. An unsupervised, incremental adaptation method is adopted for personalized model adaptation based on the recognized syllables with high recognition confidence from an automatic speech recognition (ASR) system. For articulation pattern discovery, the manually transcribed syllables and the corresponding recognized syllables are associated with each other using articulatory features. The Apriori algorithm is applied to discover the articulation patterns in the corpus, which are then used to construct a personalized pronunciation dictionary to improve the recognition accuracy of the ASR. 
The experimental results indicate that the proposed adaptation method achieves a syllable error rate reduction of 6.1\%, outperforming the conventional adaptation methods that have a syllable error rate reduction of 3.8\%.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Paik:2011:FCB, author = "Jiaul H. Paik and Swapan K. Parui", title = "A Fast Corpus-Based Stemmer", journal = j-TALIP, volume = "10", number = "2", pages = "8:1--8:??", month = jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1967293.1967295", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 28 18:29:03 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Stemming is a mechanism of word form normalization that transforms the variant word forms to their common root. In an Information Retrieval system, it is used to increase the system's performance, specifically the recall and desirably the precision. Although its usefulness is shown to be mixed in languages such as English, in morphologically complex languages stemming produces a significant performance improvement. A number of linguistic rule-based stemmers are available for most European languages which employ a set of rules to get back the root word from its variants. 
But for Indian languages which are highly inflectional in nature, devising a linguistic rule-based stemmer needs some additional resources which are not available.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Ekbal:2011:WVB, author = "Asif Ekbal and Sriparna Saha", title = "Weighted Vote-Based Classifier Ensemble for Named Entity Recognition: a Genetic Algorithm-Based Approach", journal = j-TALIP, volume = "10", number = "2", pages = "9:1--9:??", month = jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1967293.1967296", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 28 18:29:03 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this article, we report the search capability of Genetic Algorithm (GA) to construct a weighted vote-based classifier ensemble for Named Entity Recognition (NER). Our underlying assumption is that the reliability of predictions of each classifier differs among the various named entity (NE) classes. Thus, it is necessary to quantify the amount of voting of a particular classifier for a particular output class. Here, an attempt is made to determine the appropriate weights of voting for each class in each classifier using GA. The proposed technique is evaluated for four leading Indian languages, namely Bengali, Hindi, Telugu, and Oriya, which are all resource-poor in nature.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Liu:2011:VPS, author = "C.-L. Liu and M.-H. Lai and K.-W. Tien and Y.-H. Chuang and S.-H. Wu and C.-Y. 
Lee", title = "Visually and Phonologically Similar Characters in Incorrect {Chinese} Words: Analyses, Identification, and Applications", journal = j-TALIP, volume = "10", number = "2", pages = "10:1--10:??", month = jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1967293.1967297", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 28 18:29:03 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Information about students' mistakes opens a window to an understanding of their learning processes, and helps us design effective course work to help students avoid replication of the same errors. Learning from mistakes is important not just in human learning activities; it is also a crucial ingredient in techniques for the developments of student models. In this article, we report findings of our study on 4,100 erroneous Chinese words. Seventy-six percent of these errors were related to the phonological similarity between the correct and the incorrect characters, 46\% were due to visual similarity, and 29\% involved both factors. 
We propose a computing algorithm that aims at replication of incorrect Chinese words.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chen:2011:ISI, author = "Keh-Jiann Chen and Qun Liu and Nianwen Xue and Le Sun", title = "Introduction to the Special Issue on {Chinese} Language Processing", journal = j-TALIP, volume = "10", number = "3", pages = "11:1--11:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002981", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhu:2011:ATC, author = "Muhua Zhu and Jingbo Zhu and Tong Xiao", title = "Automatic Treebank Conversion via Informed Decoding --- {A} Case Study on {Chinese} Treebanks", journal = j-TALIP, volume = "10", number = "3", pages = "12:1--12:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002982", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Treebanks are valuable resources for syntactic parsing. For some languages such as Chinese, we can obtain multiple constituency treebanks which are developed by different organizations. However, due to discrepancies of underlying annotation standards, such treebanks in general cannot be used together through direct data combination. 
To enlarge training data for syntactic parsing, we focus in this article on the challenge of unifying standards of disparate treebanks by automatically converting one treebank (source treebank) to fit a different standard which is exhibited by another treebank (target treebank). We propose to convert a treebank in two sequential steps which correspond to the part-of-speech level and syntactic structure level (including tree structures and grammar labels), respectively.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2011:USR, author = "Junhui Li and Guodong Zhou", title = "Unified Semantic Role Labeling for Verbal and Nominal Predicates in the {Chinese} Language", journal = j-TALIP, volume = "10", number = "3", pages = "13:1--13:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002983", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article explores unified semantic role labeling (SRL) for both verbal and nominal predicates in the Chinese language. This is done by considering SRL for both verbal and nominal predicates in a unified framework. First, we systematically examine various kinds of features for verbal SRL and nominal SRL, respectively, besides those widely used ones. Then we further improve the performance of nominal SRL with various kinds of verbal evidence, that is, merging the training instances from verbal predicates and integrating various kinds of features derived from SRL for verbal predicates. 
Finally, we address the issue of automatic predicate recognition, which is essential for nominal SRL.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhang:2011:DPS, author = "Peng Zhang and Wenjie Li and Yuexian Hou and Dawei Song", title = "Developing Position Structure-Based Framework for {Chinese} Entity Relation Extraction", journal = j-TALIP, volume = "10", number = "3", pages = "14:1--14:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002984", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Relation extraction is the task of finding semantic relations between two entities in text, and is often cast as a classification problem. In contrast to the significant achievements on English language, research progress in Chinese relation extraction is relatively limited. In this article, we present a novel Chinese relation extraction framework, which is mainly based on a 9-position structure. The design of this proposed structure is motivated by the fact that there are some obvious connections between relation types/subtypes and position structures of two entities. 
The 9-position structure can be captured with less effort than applying deep natural language processing, and is effective to relieve the class imbalance problem which often hurts the classification performance.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Qian:2011:ECD, author = "Longhua Qian and Guodong Zhou and Qiaoming Zhu", title = "Employing Constituent Dependency Information for Tree Kernel-Based Semantic Relation Extraction between Named Entities", journal = j-TALIP, volume = "10", number = "3", pages = "15:1--15:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002985", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article proposes a new approach to dynamically determine the tree span for tree kernel-based semantic relation extraction between named entities. The basic idea is to employ constituent dependency information in keeping the necessary nodes and their head children along the path connecting the two entities in the syntactic parse tree, while removing the noisy information from the tree, eventually leading to a dynamic syntactic parse tree. This article also explores various entity features and their possible combinations via a unified syntactic and semantic tree framework, which integrates both structural syntactic parse information and entity-related semantic information. 
Evaluation on the ACE RDC 2004 English and 2005 Chinese benchmark corpora shows that our dynamic syntactic parse tree much outperforms all previous tree spans, indicating its effectiveness in well representing the structural nature of relation instances while removing redundant information.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Huang:2011:UST, author = "Chung-Chi Huang and Ho-Ching Yen and Ping-Che Yang and Shih-Ting Huang and Jason S. Chang", title = "Using Sublexical Translations to Handle the {OOV} Problem in Machine Translation", journal = j-TALIP, volume = "10", number = "3", pages = "16:1--16:??", month = sep, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2002980.2002986", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Sep 9 15:01:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We introduce a method for learning to translate out-of-vocabulary (OOV) words. The method focuses on combining sublexical/constituent translations of an OOV to generate its translation candidates. In our approach, wildcard searches are formulated based on our OOV analysis, aimed at maximizing the probability of retrieving OOVs' sublexical translations from existing resources of Machine Translation (MT) systems. 
At run-time, translation candidates of the unknown words are generated from their suitable sublexical translations and ranked based on monolingual and bilingual information.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Du:2011:ICE, author = "Jinhua Du and Andy Way", title = "Improved {Chinese--English} {SMT} with {Chinese} {``DE''} Construction Classification and Reordering", journal = j-TALIP, volume = "10", number = "4", pages = "17:1--17:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2025384.2025385", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 15 09:23:26 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Syntactic reordering on the source side has been demonstrated to be helpful and effective for handling different word orders between source and target languages in SMT. In this article, we focus on the Chinese (DE) construction which is flexible and ubiquitous in Chinese and has many different ways to be translated into English so that it is a major source of word order differences in terms of translation quality. 
This article carries out the Chinese ``DE'' construction study for Chinese--English SMT in which we propose a new classifier model---discriminative latent variable model (DPLVM)---with new features to improve the classification accuracy and indirectly improve the translation quality compared to a log-linear classifier.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Xiao:2011:LMS, author = "Tong Xiao and Jingbo Zhu and Muhua Zhu", title = "Language Modeling for {Syntax-Based} Machine Translation Using Tree Substitution Grammars: a Case Study on {Chinese-English} Translation", journal = j-TALIP, volume = "10", number = "4", pages = "18:1--18:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2025384.2025386", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 15 09:23:26 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The poor grammatical output of Machine Translation (MT) systems appeals syntax-based approaches within language modeling. However, previous studies showed that syntax-based language modeling using (Context-Free) Treebank Grammars was not very helpful in improving BLEU scores for Chinese-English machine translation. In this article we further study this issue in the context of Chinese-English syntax-based Statistical Machine Translation (SMT) where Synchronous Tree Substitution Grammars (STSGs) are utilized to model the translation process. In particular, we develop a Tree Substitution Grammar-based language model for syntax-based MT, and present three methods to efficiently integrate the proposed language model into MT decoding. 
In addition, we design a simple and effective method to adapt syntax-based language models for MT tasks.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Li:2011:MEC, author = "Lishuang Li and Peng Wang and Degen Huang and Lian Zhao", title = "Mining {English--Chinese} Named Entity Pairs from Comparable Corpora", journal = j-TALIP, volume = "10", number = "4", pages = "19:1--19:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2025384.2025387", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 15 09:23:26 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Bilingual Named Entity (NE) pairs are valuable resources for many NLP applications. Since comparable corpora are more accessible, abundant and up-to-date, recent researches have concentrated on mining bilingual lexicons using comparable corpora. Leveraging comparable corpora, this research presents a novel approach to mining English-Chinese NE translations by combining multi-dimension features from various information sources for every possible NE pair, which include the transliteration model, English-Chinese matching, Chinese-English matching, translation model, length, and context vector. These features are integrated into one model with linear combination and minimum sample risk (MSR) algorithm. 
As for the high type-dependence of NE translation, we integrate different features according to different NE types.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Liu:2011:UBR, author = "Zhiyuan Liu and Yabin Zheng and Lixing Xie and Maosong Sun and Liyun Ru and Yang Zhang", title = "User Behaviors in Related Word Retrieval and New Word Detection: a Collaborative Perspective", journal = j-TALIP, volume = "10", number = "4", pages = "20:1--20:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2025384.2025388", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 15 09:23:26 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Nowadays, user behavior analysis and collaborative filtering have drawn a large body of research in the machine learning community. The goal is either to enhance the user experience or discover useful information hidden in the data. In this article, we conduct extensive experiments on a Chinese input method data set, which keeps the word lists that users have used. Then, from the collaborative perspective, we aim to solve two tasks in natural language processing, that is, related word retrieval and new word detection. 
Motivated by the observation that two words are usually highly related to each other if they co-occur frequently in users' records, we propose a novel semantic relatedness measure between words that takes both user behaviors and collaborative filtering into consideration.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wang:2011:DLA, author = "Baoxun Wang and Bingquan Liu and Xiaolong Wang and Chengjie Sun and Deyuan Zhang", title = "Deep Learning Approaches to Semantic Relevance Modeling for {Chinese} Question--Answer Pairs", journal = j-TALIP, volume = "10", number = "4", pages = "21:1--21:??", month = dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2025384.2025389", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 15 09:23:26 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The human-generated question-answer pairs in the Web social communities are of great value for the research of automatic question-answering technique. Due to the large amount of noise information involved in such corpora, it is still a problem to detect the answers even though the questions are exactly located. Quantifying the semantic relevance between questions and their candidate answers is essential to answer detection in social media corpora. Since both the questions and their answers usually contain a small number of sentences, the relevance modeling methods have to overcome the problem of word feature sparsity. 
In this article, the deep learning principle is introduced to address the semantic relevance modeling task.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Pal:2012:HRI, author = "Umapada Pal and Ramachandran Jayadevan and Nabin Sharma", title = "Handwriting Recognition in {Indian} Regional Scripts: a Survey of Offline Techniques", journal = j-TALIP, volume = "11", number = "1", pages = "1:1--1:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2090176.2090177", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Mar 1 16:54:10 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Offline handwriting recognition in Indian regional scripts is an interesting area of research as almost 460 million people in India use regional scripts. The nine major Indian regional scripts are Bangla (for Bengali and Assamese languages), Gujarati, Kannada, Malayalam, Oriya, Gurumukhi (for Punjabi language), Tamil, Telugu, and Nastaliq (for Urdu language). 
A state-of-the-art survey about the techniques available in the area of offline handwriting recognition (OHR) in Indian regional scripts will be of a great aid to the researchers in the subcontinent and hence a sincere attempt is made in this article to discuss the advancements reported in this regard during the last few decades.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zaghouani:2012:RRB, author = "Wajdi Zaghouani", title = "{RENAR}: a Rule-Based {Arabic} Named Entity Recognition System", journal = j-TALIP, volume = "11", number = "1", pages = "2:1--2:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2090176.2090178", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Mar 1 16:54:10 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Named entity recognition has served many natural language processing tasks such as information retrieval, machine translation, and question answering systems. Many researchers have addressed the name identification issue in a variety of languages and recently some research efforts have started to focus on named entity recognition for the Arabic language. We present a working Arabic information extraction (IE) system that is used to analyze large volumes of news texts every day to extract the named entity (NE) types person, organization, location, date, and number, as well as quotations (direct reported speech) by and about people. 
The named entity recognition (NER) system was not developed for Arabic, but instead a multilingual NER system was adapted to also cover Arabic.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chang:2012:EDC, author = "Ru-Yng Chang and Chung-Hsien Wu and Philips Kokoh Prasetyo", title = "Error Diagnosis of {Chinese} Sentences Using Inductive Learning Algorithm and Decomposition-Based Testing Mechanism", journal = j-TALIP, volume = "11", number = "1", pages = "3:1--3:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2090176.2090179", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Mar 1 16:54:10 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This study presents a novel approach to error diagnosis of Chinese sentences for Chinese as second language (CSL) learners. A penalized probabilistic First-Order Inductive Learning (pFOIL) algorithm is presented for error diagnosis of Chinese sentences. The pFOIL algorithm integrates inductive logic programming (ILP), First-Order Inductive Learning (FOIL), and a penalized log-likelihood function for error diagnosis. This algorithm considers the uncertain, imperfect, and conflicting characteristics of Chinese sentences to infer error types and produce human-interpretable rules for further error correction. 
In a pFOIL algorithm, relation pattern background knowledge and quantized t-score background knowledge are proposed to characterize a sentence and then used for likelihood estimation.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{He:2012:ISP, author = "Yulan He", title = "Incorporating Sentiment Prior Knowledge for Weakly Supervised Sentiment Analysis", journal = j-TALIP, volume = "11", number = "2", pages = "4:1--4:??", month = jun, year = "2012", DOI = "https://doi.org/10.1145/2184436.2184437", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 12 11:20:16 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents two novel approaches for incorporating sentiment prior knowledge into the topic model for weakly supervised sentiment analysis where sentiment labels are considered as topics. One is by modifying the Dirichlet prior for topic-word distribution (LDA-DP), the other is by augmenting the model objective function through adding terms that express preferences on expectations of sentiment labels of the lexicon words using generalized expectation criteria (LDA-GE). We conducted extensive experiments on English movie review data and multi-domain sentiment dataset as well as Chinese product reviews about mobile phones, digital cameras, MP3 players, and monitors. The results show that while both LDA-DP and LDA-GE perform comparably to existing weakly supervised sentiment classification algorithms, they are much simpler and computationally efficient, rendering them more suitable for online and real-time sentiment classification on the Web. We observed that LDA-GE is more effective than LDA-DP, suggesting that it should be preferred when considering employing the topic model for sentiment analysis. 
Moreover, both models are able to extract highly domain-salient polarity words from text.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing (TALIP)", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wang:2012:TUF, author = "Hongling Wang and Guodong Zhou", title = "Toward a Unified Framework for Standard and Update Multi-Document Summarization", journal = j-TALIP, volume = "11", number = "2", pages = "5:1--5:??", month = jun, year = "2012", DOI = "https://doi.org/10.1145/2184436.2184438", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 12 11:20:16 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a unified framework for extracting standard and update summaries from a set of documents. In particular, a topic modeling approach is employed for salience determination and a dynamic modeling approach is proposed for redundancy control. In the topic modeling approach for salience determination, we represent various kinds of text units, such as word, sentence, document, documents, and summary, using a single vector space model via their corresponding probability distributions over the inherent topics of given documents or a related corpus. Therefore, we are able to calculate the similarity between any two text units via their topic probability distributions. In the dynamic modeling approach for redundancy control, we consider the similarity between the summary and the given documents, and the similarity between the sentence and the summary, besides the similarity between the sentence and the given documents, for standard summarization while for update summarization, we also consider the similarity between the sentence and the history documents or summary. 
Evaluation on TAC 2008 and 2009 in English language shows encouraging results, especially the dynamic modeling approach in removing the redundancy in the given documents. Finally, we extend the framework to Chinese multi-document summarization and experiments show the effectiveness of our framework.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing (TALIP)", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Andrade:2012:SEC, author = "Daniel Andrade and Takuya Matsuzaki and Jun'ichi Tsujii", title = "Statistical Extraction and Comparison of Pivot Words for Bilingual Lexicon Extension", journal = j-TALIP, volume = "11", number = "2", pages = "6:1--6:??", month = jun, year = "2012", DOI = "https://doi.org/10.1145/2184436.2184439", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 12 11:20:16 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Bilingual dictionaries can be automatically extended by new translations using comparable corpora. The general idea is based on the assumption that similar words have similar contexts across languages. However, previous studies have mainly focused on Indo-European languages, or use only a bag-of-words model to describe the context. Furthermore, we argue that it is helpful to extract only the statistically significant context, instead of using all context. The present approach addresses these issues in the following manner. First, based on the context of a word with an unknown translation (query word), we extract salient pivot words. Pivot words are words for which a translation is already available in a bilingual dictionary. For the extraction of salient pivot words, we use a Bayesian estimation of the point-wise mutual information to measure statistical significance. 
In the second step, we match these pivot words across languages to identify translation candidates for the query word. We therefore calculate a similarity score between the query word and a translation candidate using the probability that the same pivots will be extracted for both the query word and the translation candidate. The proposed method uses several context positions, namely, a bag-of-words of one sentence, and the successors, predecessors, and siblings with respect to the dependency parse tree of the sentence. In order to make these context positions comparable across Japanese and English, which are unrelated languages, we use several heuristics to adjust the dependency trees appropriately. We demonstrate that the proposed method significantly increases the accuracy of word translations, as compared to previous methods.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing (TALIP)", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Wang:2012:IGD, author = "Kun Wang and Chengqing Zong and Keh-Yih Su", title = "Integrating Generative and Discriminative Character-Based Models for {Chinese} Word Segmentation", journal = j-TALIP, volume = "11", number = "2", pages = "7:1--7:??", month = jun, year = "2012", DOI = "https://doi.org/10.1145/2184436.2184440", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Jun 12 11:20:16 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Among statistical approaches to Chinese word segmentation, the word-based n-gram ( generative ) model and the character-based tagging ( discriminative ) model are two dominant approaches in the literature. The former gives excellent performance for the in-vocabulary (IV) words; however, it handles out-of-vocabulary (OOV) words poorly. 
On the other hand, though the latter is more robust for OOV words, it fails to deliver satisfactory performance for IV words. These two approaches behave differently due to the unit they use (word vs. character) and the model form they adopt (generative vs. discriminative). In general, character-based approaches are more robust than word-based ones, as the vocabulary of characters is a closed set; and discriminative models are more robust than generative ones, since they can flexibly include all kinds of available information, such as future context. This article first proposes a character-based n -gram model to enhance the robustness of the generative approach. Then the proposed generative model is further integrated with the character-based discriminative model to take advantage of both approaches. Our experiments show that this integrated approach outperforms all the existing approaches reported in the literature. Afterwards, a complete and detailed error analysis is conducted. Since a significant portion of the critical errors is related to numerical/foreign strings, character-type information is then incorporated into the model to further improve its performance. 
Last, the proposed integrated approach is tested on cross-domain corpora, and a semi-supervised domain adaptation algorithm is proposed and shown to be effective in our experiments.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing (TALIP)", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Isozaki:2012:HBP, author = "Hideki Isozaki and Katsuhito Sudoh and Hajime Tsukada and Kevin Duh", title = "{HPSG}-Based Preprocessing for {English-to-Japanese} Translation", journal = j-TALIP, volume = "11", number = "3", pages = "8:1--8:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2334801.2334802", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Sep 11 14:17:04 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Japanese sentences have completely different word orders from corresponding English sentences. Typical phrase-based statistical machine translation (SMT) systems such as Moses search for the best word permutation within a given distance limit (distortion limit). For English-to-Japanese translation, we need a large distance limit to obtain acceptable translations, and the number of translation candidates is extremely large. Therefore, SMT systems often fail to find acceptable translations within a limited time. To solve this problem, some researchers use rule-based preprocessing approaches, which reorder English words just like Japanese by using dozens of rules. Our idea is based on the following two observations: (1) Japanese is a typical head-final language, and (2) we can detect heads of English sentences by a head-driven phrase structure grammar (HPSG) parser. 
The main contributions of this article are twofold: First, we demonstrate how off-the-shelf, state-of-the-art HPSG parser enables us to write the reordering rules in an abstract level and can easily improve the quality of English-to-Japanese translation. Second, we also show that syntactic heads achieve better results than semantic heads. The proposed method outperforms the best system of NTCIR-7 PATMT EJ task.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhang:2012:ABH, author = "Lidan Zhang and Kwok-Ping Chan", title = "Adaptive {Bayesian HMM} for Fully Unsupervised {Chinese} Part-of-Speech Induction", journal = j-TALIP, volume = "11", number = "3", pages = "9:1--9:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2334801.2334803", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Sep 11 14:17:04 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We propose an adaptive Bayesian hidden Markov model for fully unsupervised part-of-speech (POS) induction. The proposed model with its inference algorithm has two extensions to the first-order Bayesian HMM with Dirichlet priors. First our algorithm infers the optimal number of hidden states from the training corpus rather than fixes the dimensionality of state space beforehand. The second extension studies the Chinese unknown word processing module which measures similarities from both morphological properties and context distribution.
Experimental results showed that both of these two extensions can help to find the optimal categories for Chinese in terms of both unsupervised clustering metrics and grammar induction accuracies on the Chinese Treebank.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Lee:2012:SMB, author = "Jinsik Lee and Sungjin Lee and Jonghoon Lee and Byeongchang Kim and Gary Geunbae Lee", title = "Stacking Model-Based {Korean} Prosodic Phrasing Using Speaker Variability Reduction and Linguistic Feature Engineering", journal = j-TALIP, volume = "11", number = "3", pages = "10:1--10:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2334801.2334804", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Sep 11 14:17:04 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents a prosodic phrasing model for a general purpose Korean speech synthesis system. To reflect the factors affecting prosodic phrasing in the model, linguistically motivated machine-learning features were investigated. These features were effectively incorporated using a stacking model. The phrasing performance was also improved through feature engineering. The corpus used in the experiment is a 4,392-sentence corpus (55,015 words with an average of 13 words per sentence). Because the corpus contains speaker-dependent variability and such variability is not appropriately reflected in a general purpose speech synthesis system, a method to reduce such variability is proposed. 
In addition, the entire set of data used in the experiment is provided to the public for future use in comparative research.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Duc:2012:CLL, author = "Nguyen Tuan Duc and Danushka Bollegala and Mitsuru Ishizuka", title = "Cross-Language Latent Relational Search between {Japanese} and {English} Languages Using a {Web} Corpus", journal = j-TALIP, volume = "11", number = "3", pages = "11:1--11:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2334801.2334805", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Tue Sep 11 14:17:04 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Latent relational search is a novel entity retrieval paradigm based on the proportional analogy between two entity pairs. Given a latent relational search query {(Japan, Tokyo), (France, ?)}, a latent relational search engine is expected to retrieve and rank the entity ``Paris'' as the first answer in the result list. A latent relational search engine extracts entities and relations between those entities from a corpus, such as the Web. Moreover, from some supporting sentences in the corpus, (e.g., ``Tokyo is the capital of Japan'' and ``Paris is the capital and biggest city of France''), the search engine must recognize the relational similarity between the two entity pairs. In cross-language latent relational search, the entity pairs as well as the supporting sentences of the first entity pair and of the second entity pair are in different languages. Therefore, the search engine must recognize similar semantic relations across languages. In this article, we study the problem of cross-language latent relational search between Japanese and English using Web data. 
To perform cross-language latent relational search in high speed, we propose a multi-lingual indexing method for storing entities and lexical patterns that represent the semantic relations extracted from Web corpora. We then propose a hybrid lexical pattern clustering algorithm to capture the semantic similarity between lexical patterns across languages. Using this algorithm, we can precisely measure the relational similarity between entity pairs across languages, thereby achieving high precision in the task of cross-language latent relational search. Experiments show that the proposed method achieves an MRR of 0.605 on Japanese-English cross-language latent relational search query sets and it also achieves a reasonable performance on the INEX Entity Ranking task.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Mitamura:2012:ISI, author = "Teruko Mitamura and Noriko Kando and Koichi Takeda", title = "Introduction to the Special Issue on {RITE}", journal = j-TALIP, volume = "11", number = "4", pages = "12:1--12:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382594", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Miyao:2012:ETE, author = "Yusuke Miyao and Hideki Shima and Hiroshi Kanayama and Teruko Mitamura", title = "Evaluating Textual Entailment Recognition for University Entrance Examinations", journal = j-TALIP, volume = "11", number = "4", pages = "13:1--13:??", month = dec, year = "2012", CODEN = "????", DOI = 
"https://doi.org/10.1145/2382593.2382595", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The present article addresses an attempt to apply questions in university entrance examinations to the evaluation of textual entailment recognition. Questions in several fields, such as history and politics, primarily test the examinee's knowledge in the form of choosing true statements from multiple choices. Answering such questions can be regarded as equivalent to finding evidential texts from a textbase such as textbooks and Wikipedia. Therefore, this task can be recast as recognizing textual entailment between a description in a textbase and a statement given in a question. We focused on the National Center Test for University Admission in Japan and converted questions into the evaluation data for textual entailment recognition by using Wikipedia as a textbase. Consequently, it is revealed that nearly half of the questions can be mapped into textual entailment recognition; 941 text pairs were created from 404 questions from six subjects. This data set is provided for a subtask of NTCIR RITE (Recognizing Inference in Text), and 16 systems from six teams used the data set for evaluation. 
The evaluation results revealed that the best system achieved a correct answer ratio of 56\%, which is significantly better than a random choice baseline.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Pham:2012:LRT, author = "Minh Quang Nhat Pham and Minh Le Nguyen and Akira Shimazu", title = "Learning to Recognize Textual Entailment in {Japanese} Texts with the Utilization of Machine Translation", journal = j-TALIP, volume = "11", number = "4", pages = "14:1--14:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382596", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Recognizing Textual Entailment (RTE) is a fundamental task in Natural Language Understanding. The task is to decide whether the meaning of a text can be inferred from the meaning of another one. In this article, we conduct an empirical study of recognizing textual entailment in Japanese texts, in which we adopt a machine learning-based approach to the task. We quantitatively analyze the effects of various entailment features, machine learning algorithms, and the impact of RTE resources on the performance of an RTE system. This article also investigates the use of machine translation for the RTE task and determines whether machine translation can be used to improve the performance of our RTE system. Experimental results achieved on benchmark data sets show that our machine learning-based RTE system outperforms the baseline methods based on lexical matching and syntactic matching. 
The results also suggest that the machine translation component can be utilized to improve the performance of the RTE system.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Qiu:2012:RIT, author = "Xipeng Qiu and Ling Cao and Zhao Liu and Xuanjing Huang", title = "Recognizing Inference in Texts with {Markov} Logic Networks", journal = j-TALIP, volume = "11", number = "4", pages = "15:1--15:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382597", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Recognizing inference in texts (RITE) attracts growing attention of natural language processing (NLP) researchers in recent years. In this article, we propose a novel approach to recognize inference with probabilistic logical reasoning. Our approach is built on Markov logic networks (MLNs) framework, which is a probabilistic extension of first-order logic. We design specific semantic rules based on the surface, syntactic, and semantic representations of texts, and map these rules to logical representations. We also extract information from some knowledge bases as common sense logic rules. Then we utilize MLNs framework to make predictions with combining statistical and logical reasoning. 
Experiment results show that our system can achieve better performance than state-of-the-art RITE systems.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Shibata:2012:PAS, author = "Tomohide Shibata and Sadao Kurohashi", title = "Predicate-Argument Structure-Based Textual Entailment Recognition System Exploiting Wide-Coverage Lexical Knowledge", journal = j-TALIP, volume = "11", number = "4", pages = "16:1--16:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382598", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article proposes a predicate-argument structure based Textual Entailment Recognition system exploiting wide-coverage lexical knowledge. Different from conventional machine learning approaches where several features obtained from linguistic analysis and resources are utilized, our proposed method regards a predicate-argument structure as a basic unit, and performs the matching/alignment between a text and hypothesis.
In matching between predicate-arguments, wide-coverage relations between words/phrases such as synonym and is-a are utilized, which are automatically acquired from a dictionary, Web corpus, and Wikipedia.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Shih:2012:VCT, author = "Cheng-Wei Shih and Cheng-Wei Lee and Richard Tzong-Han Tsai and Wen-Lian Hsu", title = "Validating Contradiction in Texts Using Online Co-Mention Pattern Checking", journal = j-TALIP, volume = "11", number = "4", pages = "17:1--17:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382599", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Detecting contradictive statements is a foundational and challenging task for text understanding applications such as textual entailment. In this article, we aim to address the problem of the shortage of specific background knowledge in contradiction detection. A novel contradiction detecting approach based on the distribution of the query composed of critical mismatch combinations on the Internet is proposed to tackle the problem. By measuring the availability of mismatch conjunction phrases (MCPs), the background knowledge about two target statements can be implicitly obtained for identifying contradictions.
Experiments on three different configurations show that the MCP-based approach achieves remarkable improvement on contradiction detection and can significantly improve the performance of textual entailment recognition.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Watanabe:2012:LDL, author = "Yotaro Watanabe and Junta Mizuno and Eric Nichols and Katsuma Narisawa and Keita Nabeshima and Naoaki Okazaki and Kentaro Inui", title = "Leveraging Diverse Lexical Resources for Textual Entailment Recognition", journal = j-TALIP, volume = "11", number = "4", pages = "18:1--18:??", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2382593.2382600", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Dec 6 07:40:55 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Since the problem of textual entailment recognition requires capturing semantic relations between diverse expressions of language, linguistic and world knowledge play an important role. In this article, we explore the effectiveness of different types of currently available resources including synonyms, antonyms, hypernym-hyponym relations, and lexical entailment relations for the task of textual entailment recognition. In order to do so, we develop an entailment relation recognition system which utilizes diverse linguistic analyses and resources to align the linguistic units in a pair of texts and identifies entailment relations based on these alignments. We use the Japanese subset of the NTCIR-9 RITE-1 dataset for evaluation and error analysis, conducting ablation testing and evaluation on hand-crafted alignment gold standard data to evaluate the contribution of individual resources. 
Error analysis shows that existing knowledge sources are effective for RTE, but that their coverage is limited, especially for domain-specific and other low-frequency expressions. To increase alignment coverage on such expressions, we propose a method of alignment inference that uses syntactic and semantic dependency information to identify likely alignments without relying on external resources. Evaluation adding alignment inference to a system using all available knowledge sources shows improvements in both precision and recall of entailment relation recognition.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Hao:2013:TPP, author = "Tianyong Hao and Chunshen Zhu", title = "Toward a Professional Platform for {Chinese} Character Conversion", journal = j-TALIP, volume = "12", number = "1", pages = "1:1--1:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2425327.2425328", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Mar 2 09:25:42 MST 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Increasing communication among Chinese-speaking regions using respectively traditional and simplified Chinese character systems has highlighted the subtle-yet-extensive differences between the two systems, which can lead to unexpected hindrance in converting characters from one to the other. This article proposes a new priority-based multi-data resources management model, with a new algorithm called Fused Conversion algorithm from Multi-Data resources (FCMD), to ensure more context-sensitive, human controllable, and thus more reliable conversions, by drawing on reverse maximum matching, n -gram-based statistical model and pattern-based learning and matching. 
After parameter training on the Tagged Chinese Gigaword corpus, its conversion precision reaches 91.5\% in context-sensitive cases, the most difficult part in the conversion, with an overall precision rate at 99.8\%, a significant improvement over the state-of-the-art models. The conversion platform based on the model has extra features such as data resource selection and $n$-grams self-learning ability, providing a more sophisticated tool good especially for high-end professional uses.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Jiang:2013:LRC, author = "Mike Tian-Jian Jiang and Tsung-Hsien Lee and Wen-Lian Hsu", title = "The Left and Right Context of a Word: Overlapping {Chinese} Syllable Word Segmentation with Minimal Context", journal = j-TALIP, volume = "12", number = "1", pages = "2:1--2:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2425327.2425329", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Mar 2 09:25:42 MST 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Since a Chinese syllable can correspond to many characters (homophones), the syllable-to-character conversion task is quite challenging for Chinese phonetic input methods (CPIM). There are usually two stages in a CPIM: 1. segment the syllable sequence into syllable words, and 2. select the most likely character words for each syllable word. A CPIM usually assumes that the input is a complete sentence, and evaluates the performance based on a well-formed corpus. However, in practice, most Pinyin users prefer progressive text entry in several short chunks, mainly in one or two words each (most Chinese words consist of two or more characters). 
Short chunks do not provide enough contexts to perform the best possible syllable-to-character conversion, especially when a chunk consists of overlapping syllable words. In such cases, a conversion system often selects the boundary of a word with the highest frequency. Short chunk input is even more popular on platforms with limited computing power, such as mobile phones. Based on the observation that the relative strength of a word can be quite different when calculated leftwards or rightwards, we propose a simple division of the word context into the left context and the right context. Furthermore, we design a double ranking strategy for each word to reduce the number of errors in Step 1. Our strategy is modeled as the minimum feedback arc set problem on bipartite tournament with approximate solutions derived from genetic algorithm. Experiments show that, compared to the frequency-based method (FBM) (low memory and fast) and the conditional random fields (CRF) model (larger memory and slower), our double ranking strategy has the benefits of less memory and low power requirement with competitive performance. 
We believe a similar strategy could also be adopted to disambiguate conflicting linguistic patterns effectively.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Bach:2013:TPF, author = "Ngo Xuan Bach and Nguyen Le Minh and Tran Thi Oanh and Akira Shimazu", title = "A Two-Phase Framework for Learning Logical Structures of Paragraphs in Legal Articles", journal = j-TALIP, volume = "12", number = "1", pages = "3:1--3:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2425327.2425330", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Mar 2 09:25:42 MST 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Analyzing logical structures of texts is important to understanding natural language, especially in the legal domain, where legal texts have their own specific characteristics. Recognizing logical structures in legal texts does not only help people in understanding legal documents, but also in supporting other tasks in legal text processing. In this article, we present a new task, learning logical structures of paragraphs in legal articles, which is studied in research on Legal Engineering. The goals of this task are recognizing logical parts of law sentences in a paragraph, and then grouping related logical parts into some logical structures of formulas, which describe logical relations between logical parts. We present a two-phase framework to learn logical structures of paragraphs in legal articles. In the first phase, we model the problem of recognizing logical parts in law sentences as a multi-layer sequence learning problem, and present a CRF-based model to recognize them. In the second phase, we propose a graph-based method to group logical parts into logical structures. 
We consider the problem of finding a subset of complete subgraphs in a weighted-edge complete graph, where each node corresponds to a logical part, and a complete subgraph corresponds to a logical structure. We also present an integer linear programming formulation for this optimization problem. Our models achieve 74.37\% in recognizing logical parts, 80.08\% in recognizing logical structures, and 58.36\% in the whole task on the Japanese National Pension Law corpus. Our work provides promising results for further research on this interesting task.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sundaram:2013:AFB, author = "Suresh Sundaram and A. G. Ramakrishnan", title = "Attention-Feedback Based Robust Segmentation of Online Handwritten Isolated {Tamil} Words", journal = j-TALIP, volume = "12", number = "1", pages = "4:1--4:??", month = mar, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2425327.2425331", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Mar 2 09:25:42 MST 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this article, we propose a lexicon-free, script-dependent approach to segment online handwritten isolated Tamil words into its constituent symbols. Our proposed segmentation strategy comprises two modules, namely the (1) Dominant Overlap Criterion Segmentation (DOCS) module and (2) Attention Feedback Segmentation (AFS) module. Based on a bounding box overlap criterion in the DOCS module, the input word is first segmented into stroke groups. A stroke group may at times correspond to a part of a valid symbol (over-segmentation) or a merger of valid symbols (under-segmentation). 
Attention on specific features in the AFS module serve in detecting possibly over-segmented or under-segmented stroke groups. Thereafter, feedbacks from the SVM classifier likelihoods and stroke-group based features are considered in modifying the suspected stroke groups to form valid symbols. The proposed scheme is tested on a set of 10000 isolated handwritten words (containing 53,246 Tamil symbols). The results show that the DOCS module achieves a symbol-level segmentation accuracy of 98.1\%, which improves to as high as 99.7\% after the AFS strategy. This in turn entails a symbol recognition rate of 83.9\% (at the DOCS module) and 88.4\% (after the AFS module). The resulting word recognition rates at the DOCS and AFS modules are found to be, 50.9\% and 64.9\% respectively, without any postprocessing.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sun:2013:LAC, author = "Xu Sun and Naoaki Okazaki and Jun'ichi Tsujii and Houfeng Wang", title = "Learning Abbreviations from {Chinese} and {English} Terms by Modeling Non-Local Information", journal = j-TALIP, volume = "12", number = "2", pages = "5:1--5:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2461316.2461317", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jun 6 06:48:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The present article describes a robust approach for abbreviating terms. First, in order to incorporate non-local information into abbreviation generation tasks, we present both implicit and explicit solutions: the latent variable model and the label encoding with global information. Although the two approaches compete with one another, we find they are also highly complementary. 
We propose a combination of the two approaches, and we will show the proposed method outperforms all of the existing methods on abbreviation generation datasets. In order to reduce computational complexity of learning non-local information, we further present an online training method, which can arrive at the objective optimum with accelerated training speed. We used a Chinese newswire dataset and an English biomedical dataset for experiments. Experiments revealed that the proposed abbreviation generator with non-local information achieved the best results for both the Chinese and English languages.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Hinkle:2013:DES, author = "Lauren Hinkle and Albert Brouillette and Sujay Jayakar and Leigh Gathings and Miguel Lezcano and Jugal Kalita", title = "Design and Evaluation of Soft Keyboards for {Brahmic} Scripts", journal = j-TALIP, volume = "12", number = "2", pages = "6:1--6:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2461316.2461318", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jun 6 06:48:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Despite being spoken by a large percentage of the world, Indic languages in general lack user-friendly and efficient methods for text input. These languages have poor or no support for typing. Soft keyboards, because of their ease of installation and lack of reliance on specific hardware, are a promising solution as an input device for many languages. Developing an acceptable soft keyboard requires the frequency analysis of characters in order to design a layout that minimizes text-input time.
This article proposes the use of various development techniques, layout variations, and evaluation methods for the creation of soft keyboards for Brahmic scripts. We propose that using optimization techniques such as genetic algorithms and multi-objective Pareto optimization to develop multi-layer keyboards will increase the speed at which text can be entered.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Fujita:2013:WSD, author = "Sanae Fujita and Akinori Fujino", title = "Word Sense Disambiguation by Combining Labeled Data Expansion and Semi-Supervised Learning Method", journal = j-TALIP, volume = "12", number = "2", pages = "7:1--7:??", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2461316.2461319", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Jun 6 06:48:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Lack of labeled data is one of the severest problems facing word sense disambiguation (WSD). We overcome the problem by proposing a method that combines automatic labeled data expansion (Step 1) and semi-supervised learning (Step 2). The Step 1 and 2 methods are both effective, but their combination yields a synergistic effect. In this article, in Step 1, we automatically extract reliable labeled data from raw corpora using dictionary example sentences, even the infrequent and unseen senses (which are not likely to appear in labeled data). Next, in Step 2, we apply a semi-supervised classifier and achieve an improvement using easy-to-get unlabeled data. In this step, we also show that we can guess even unseen senses. We target a SemEval-2010 Japanese WSD task, which is a lexical sample task. Both Step 1 and Step 2 methods performed better than the best published result (76.4 \%). 
Furthermore, the combined method achieved much higher accuracy (84.2 \%). In this experiment, up to 50 \% of unseen senses are classified correctly. However, the number of unseen senses is small; therefore, we delete one sense per word and apply our proposed method; the results show that the method is effective and robust even for unseen senses.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sproat:2013:EGN, author = "Richard Sproat", title = "Editorial Greetings from the new {Editor-in-Chief}", journal = j-TALIP, volume = "12", number = "3", pages = "8:1--8:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499955.2499956", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Aug 19 18:39:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Fukunishi:2013:BAA, author = "Takaaki Fukunishi and Andrew Finch and Seiichi Yamamoto and Eiichiro Sumita", title = "A {Bayesian} Alignment Approach to Transliteration Mining", journal = j-TALIP, volume = "12", number = "3", pages = "9:1--9:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499955.2499957", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Aug 19 18:39:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this article we present a technique for mining transliteration pairs using a set of simple features derived from a many-to-many bilingual forced-alignment at the grapheme level to classify candidate transliteration word pairs as correct 
transliterations or not. We use a nonparametric Bayesian method for the alignment process, as this process rewards the reuse of parameters, resulting in compact models that align in a consistent manner and tend not to over-fit. Our approach uses the generative model resulting from aligning the training data to force-align the test data. We rely on the simple assumption that correct transliteration pairs would be well modeled and generated easily, whereas incorrect pairs---being more random in character---would be more costly to model and generate. Our generative model generates by concatenating bilingual grapheme sequence pairs. The many-to-many generation process is essential for handling many languages with non-Roman scripts, and it is hard to train well using a maximum likelihood techniques, as these tend to over-fit the data. Our approach works on the principle that generation using only grapheme sequence pairs that are in the model results in a high probability derivation, whereas if the model is forced to introduce a new parameter in order to explain part of the candidate pair, the derivation probability is substantially reduced and severely reduced if the new parameter corresponds to a sequence pair composed of a large number of graphemes. The features we extract from the alignment of the test data are not only based on the scores from the generative model, but also on the relative proportions of each sequence that are hard to generate. The features are used in conjunction with a support vector machine classifier trained on known positive examples together with synthetic negative examples to determine whether a candidate word pair is a correct transliteration pair. In our experiments, we used all data tracks from the 2010 Named-Entity Workshop (NEWS'10) and use the performance of the best system for each language pair as a reference point. 
Our results show that the new features we propose are powerfully predictive, enabling our approach to achieve levels of performance on this task that are comparable to the state of the art.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Iwakura:2013:NER, author = "Tomoya Iwakura and Hiroya Takamura and Manabu Okumura", title = "A Named Entity Recognition Method Based on Decomposition and Concatenation of Word Chunks", journal = j-TALIP, volume = "12", number = "3", pages = "10:1--10:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499955.2499958", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Aug 19 18:39:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We propose a named entity (NE) recognition method in which word chunks are repeatedly decomposed and concatenated. Our method identifies word chunks with a base chunker, such as a noun phrase chunker, and then recognizes NEs from the recognized word chunk sequences. By using word chunks, we can obtain features that cannot be obtained in word-sequence-based recognition methods, such as the first word of a word chunk, the last word of a word chunk, and so on. However, each word chunk may include a part of an NE or multiple NEs. To solve this problem, we use the following operators: SHIFT for separating the first word from a word chunk, POP for separating the last word from a word chunk, JOIN for concatenating two word chunks, and REDUCE for assigning an NE label to a word chunk. We evaluate our method on a Japanese NE recognition dataset that includes about 200,000 annotations of 191 types of NEs from over 8,500 news articles. 
The experimental results show that the training and processing speeds of our method are faster than those of a linear-chain structured perceptron and a semi-Markov perceptron, while maintaining high accuracy.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Izumi:2013:NCF, author = "Tomoko Izumi and Kenji Imamura and Taichi Asami and Kuniko Saito and Genichiro Kikui and Satoshi Sato", title = "Normalizing Complex Functional Expressions in {Japanese} Predicates: Linguistically-Directed Rule-Based Paraphrasing and Its Application", journal = j-TALIP, volume = "12", number = "3", pages = "11:1--11:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499955.2499959", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Aug 19 18:39:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The growing need for text mining systems, such as opinion mining, requires a deep semantic understanding of the target language. In order to accomplish this, extracting the semantic information of functional expressions plays a crucial role, because functional expressions such as would like to and can't are key expressions to detecting customers' needs and wants. However, in Japanese, functional expressions appear in the form of suffixes, and two different types of functional expressions are merged into one predicate: one influences the factual meaning of the predicate while the other is merely used for discourse purposes. This triggers an increase in surface forms, which hinders information extraction systems. In this article, we present a novel normalization technique that paraphrases complex functional expressions into simplified forms that retain only the crucial meaning of the predicate. 
We construct paraphrasing rules based on linguistic theories in syntax and semantics. The results of experiments indicate that our system achieves a high accuracy of 79.7\%, while it reduces the differences in functional expressions by up to 66.7\%. The results also show an improvement in the performance of predicate extraction, providing encouraging evidence of the usability of paraphrasing as a means of normalizing different language expressions.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sudoh:2013:SBP, author = "Katsuhito Sudoh and Xianchao Wu and Kevin Duh and Hajime Tsukada and Masaaki Nagata", title = "Syntax-Based Post-Ordering for Efficient {Japanese-to-English} Translation", journal = j-TALIP, volume = "12", number = "3", pages = "12:1--12:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499955.2499960", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Mon Aug 19 18:39:55 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article proposes a novel reordering method for efficient two-step Japanese-to-English statistical machine translation (SMT) that isolates reordering from SMT and solves it after lexical translation. This reordering problem, called post-ordering, is solved as an SMT problem from Head-Final English (HFE) to English. HFE is syntax-based reordered English that is very successfully used for reordering with English-to-Japanese SMT. The proposed method incorporates its advantage into the reverse direction, Japanese-to-English, and solves the post-ordering problem by accurate syntax-based SMT with target language syntax. 
Two-step SMT with the proposed post-ordering empirically reduces the decoding time of the accurate but slow syntax-based SMT by its good approximation using intermediate HFE. The proposed method improves the decoding speed of syntax-based SMT decoding by about six times with comparable translation accuracy in Japanese-to-English patent translation experiments.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sproat:2013:TP, author = "Richard Sproat", title = "{TALIP} Perspectives", journal = j-TALIP, volume = "12", number = "4", pages = "13:1--13:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523057.2523058", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Oct 30 12:33:24 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Paul:2013:HCB, author = "Michael Paul and Andrew Finch and Eiichiro Sumita", title = "How to Choose the Best Pivot Language for Automatic Translation of Low-Resource Languages", journal = j-TALIP, volume = "12", number = "4", pages = "14:1--14:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2505126", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Oct 30 12:33:24 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Recent research on multilingual statistical machine translation focuses on the usage of pivot languages in order to overcome language resource limitations for certain language pairs. 
Due to the richness of available language resources, English is, in general, the pivot language of choice. However, factors like language relatedness can also affect the choice of the pivot language for a given language pair, especially for Asian languages, where language resources are currently quite limited. In this article, we provide new insights into what factors make a pivot language effective and investigate the impact of these factors on the overall pivot translation performance for translation between 22 Indo-European and Asian languages. Experimental results using state-of-the-art statistical machine translation techniques revealed that the translation quality of 54.8\% of the language pairs improved when a non-English pivot language was chosen. Moreover, 81.0\% of system performance variations can be explained by a combination of factors such as language family, vocabulary, sentence length, language perplexity, translation model entropy, reordering, monotonicity, and engine performance.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Huang:2013:CAT, author = "Chung-Chi Huang and Mei-Hua Chen and Ping-Che Yang and Jason S. Chang", title = "A Computer-Assisted Translation and Writing System", journal = j-TALIP, volume = "12", number = "4", pages = "15:1--15:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2505984", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Oct 30 12:33:24 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "We introduce a method for learning to predict text and grammatical construction in a computer-assisted translation and writing framework. 
In our approach, predictions are offered on the fly to help the user make appropriate lexical and grammar choices during the translation of a source text, thus improving translation quality and productivity. The method involves automatically generating general-to-specific word usage summaries (i.e., writing suggestion module), and automatically learning high-confidence word- or phrase-level translation equivalents (i.e., translation suggestion module). At runtime, the source text and its translation prefix entered by the user are broken down into $n$-grams to generate grammar and translation predictions, which are further combined and ranked via translation and language models. These ranked prediction candidates are iteratively and interactively displayed to the user in a pop-up menu as translation or writing hints. We present a prototype writing assistant, TransAhead, that applies the method to a human-computer collaborative environment. Automatic and human evaluations show that novice translators or language learners substantially benefit from our system in terms of translation performance (i.e., translation accuracy and productivity) and language learning (i.e., collocation usage and grammar). 
In general, our methodology of inline grammar and text predictions or suggestions has great potential in the field of computer-assisted translation, writing, or even language learning.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Chu:2013:CJM, author = "Chenhui Chu and Toshiaki Nakazawa and Daisuke Kawahara and Sadao Kurohashi", title = "{Chinese--Japanese} Machine Translation Exploiting {Chinese} Characters", journal = j-TALIP, volume = "12", number = "4", pages = "16:1--16:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2523057.2523059", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Oct 30 12:33:24 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Chinese and Japanese languages share Chinese characters. Since the Chinese characters in Japanese originated from ancient China, many common Chinese characters exist between these two languages. Since Chinese characters contain significant semantic information and common Chinese characters share the same meaning in the two languages, they can be quite useful in Chinese--Japanese machine translation (MT). We therefore propose a method for creating a Chinese character mapping table for Japanese, traditional Chinese, and simplified Chinese, with the aim of constructing a complete resource of common Chinese characters. Furthermore, we point out two main problems in Chinese word segmentation for Chinese--Japanese MT, namely, unknown words and word segmentation granularity, and propose an approach exploiting common Chinese characters to solve these problems. 
We also propose a statistical method for detecting other semantically equivalent Chinese characters other than the common ones and a method for exploiting shared Chinese characters in phrase alignment. Results of the experiments carried out on a state-of-the-art phrase-based statistical MT system and an example-based MT system show that our proposed approaches can improve MT performance significantly, thereby verifying the effectiveness of shared Chinese characters for Chinese--Japanese MT.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Goto:2013:POP, author = "Isao Goto and Masao Utiyama and Eiichiro Sumita", title = "Post-Ordering by Parsing with {ITG} for {Japanese--English} Statistical Machine Translation", journal = j-TALIP, volume = "12", number = "4", pages = "17:1--17:??", month = oct, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2518100", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Oct 30 12:33:24 MDT 2013", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Word reordering is a difficult task for translation between languages with widely different word orders, such as Japanese and English. A previously proposed post-ordering method for Japanese-to-English translation first translates a Japanese sentence into a sequence of English words in a word order similar to that of Japanese, then reorders the sequence into an English word order. We employed this post-ordering framework and improved upon its reordering method. 
The existing post-ordering method reorders the sequence of English words via SMT, whereas our method reorders the sequence by (1) parsing the sequence using ITG to obtain syntactic structures which are similar to Japanese syntactic structures, and (2) transferring the obtained syntactic structures into English syntactic structures according to the ITG. The experiments using Japanese-to-English patent translation demonstrated the effectiveness of our method and showed that both the RIBES and BLEU scores were improved over compared methods.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Strotgen:2014:TML, author = "Jannik Str{\"o}tgen and Ayser Armiti and Tran Van Canh and Julian Zell and Michael Gertz", title = "Time for More Languages: Temporal Tagging of {Arabic}, {Italian}, {Spanish}, and {Vietnamese}", journal = j-TALIP, volume = "13", number = "1", pages = "1:1--1:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2540989", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 27 12:18:55 MST 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Most of the research on temporal tagging so far is done for processing English text documents. There are hardly any multilingual temporal taggers supporting more than two languages. Recently, the temporal tagger HeidelTime has been made publicly available, supporting the integration of new languages by developing language-dependent resources without modifying the source code. In this article, we describe our work on developing such resources for two Asian and two Romance languages: Arabic, Vietnamese, Spanish, and Italian. 
While temporal tagging of the two Romance languages has been addressed before, there has been almost no research on Arabic and Vietnamese temporal tagging so far. Furthermore, we analyze language-dependent challenges for temporal tagging and explain the strategies we followed to address them. Our evaluation results on publicly available and newly annotated corpora demonstrate the high quality of our new resources for the four languages, which we make publicly available to the research community.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Goto:2014:DMB, author = "Isao Goto and Masao Utiyama and Eiichiro Sumita and Akihiro Tamura and Sadao Kurohashi", title = "Distortion Model Based on Word Sequence Labeling for Statistical Machine Translation", journal = j-TALIP, volume = "13", number = "1", pages = "2:1--2:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2537128", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 27 12:18:55 MST 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article proposes a new distortion model for phrase-based statistical machine translation. In decoding, a distortion model estimates the source word position to be translated next (subsequent position; SP) given the last translated source word position (current position; CP). We propose a distortion model that can simultaneously consider the word at the CP, the word at an SP candidate, the context of the CP and an SP candidate, relative word order among the SP candidates, and the words between the CP and an SP candidate. These considered elements are called rich context. Our model considers rich context by discriminating label sequences that specify spans from the CP to each SP candidate. 
It enables our model to learn the effect of relative word order among SP candidates as well as to learn the effect of distances from the training data. In contrast to the learning strategy of existing methods, our learning strategy is that the model learns preference relations among SP candidates in each sentence of the training data. This learning strategy enables consideration of all of the rich context simultaneously. In our experiments, our model had higher BLEU and RIBES scores for Japanese-English, Chinese--English, and German-English translation compared to the lexical reordering models.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Kim:2014:CLA, author = "Seokhwan Kim and Minwoo Jeong and Jonghoon Lee and Gary Geunbae Lee", title = "Cross-Lingual Annotation Projection for Weakly-Supervised Relation Extraction", journal = j-TALIP, volume = "13", number = "1", pages = "3:1--3:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2529994", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 27 12:18:55 MST 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Although researchers have conducted extensive studies on relation extraction in the last decade, statistical systems based on supervised learning are still limited, because they require large amounts of training data to achieve high performance level. In this article, we propose cross-lingual annotation projection methods that leverage parallel corpora to build a relation extraction system for a resource-poor language without significant annotation efforts. To make our method more reliable, we introduce two types of projection approaches with noise reduction strategies. 
We demonstrate the merit of our method using a Korean relation extraction system trained on projected examples from an English-Korean parallel corpus. Experiments show the feasibility of our approaches through comparison to other systems based on monolingual resources.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Yahya:2014:ATC, author = "Adnan Yahya and Ali Salhi", title = "{Arabic} Text Categorization Based on {Arabic Wikipedia}", journal = j-TALIP, volume = "13", number = "1", pages = "4:1--4:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2537129", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 27 12:18:55 MST 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article describes an algorithm for categorizing Arabic text, relying on highly categorized corpus-based datasets obtained from the Arabic Wikipedia by using manual and automated processes to build and customize categories. The categorization algorithm was built by adopting a simple categorization idea then moving forward to more complex ones. We applied tests and filtration criteria to reach the best and most efficient results that our algorithm can achieve. The categorization depends on the statistical relations between the input (test) text and the reference (training) data supported by well-defined Wikipedia-based categories. Our algorithm supports two levels for categorizing Arabic text; categories are grouped into a hierarchy of main categories and subcategories. This introduces a challenge due to the correlation between certain subcategories and overlap between main categories. 
We argue that our algorithm achieved good performance compared to other methods reported in the literature.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Church:2014:TPG, author = "Kenneth Church", title = "{TALIP} Perspectives, Guest Editorial Commentary: What Counts (and What Ought to Count)?", journal = j-TALIP, volume = "13", number = "1", pages = "5:1--5:??", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2559789", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Thu Feb 27 12:18:55 MST 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sulaiman:2014:EJS, author = "Suliana Sulaiman and Khairuddin Omar and Nazlia Omar and Mohd Zamri Murah and Hamdan Abdul Rahman", title = "The Effectiveness of a {Jawi} Stemmer for Retrieving Relevant {Malay} Documents in {Jawi} Characters", journal = j-TALIP, volume = "13", number = "2", pages = "6:1--6:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2540988", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Jun 20 18:22:19 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Malay language has two types of writing script, known as Rumi and Jawi. Most previous stemmer results have reported on Malay Rumi characters and only a few have tested Jawi characters. In this article, a new Jawi stemmer has been proposed and tested for document retrieval. A total of 36 queries and datasets from the transliterated Jawi Quran were used. 
The experiment shows that the mean average precision for a ``stemmed Jawi'' document is 8.43\%. At the same time, the mean average precision for a ``nonstemmed Jawi'' document is 5.14\%. The result from a paired sample t-test showed that the use of a ``stemmed Jawi'' document increased the precision in document retrieval. Further experiments were performed to examine the precision of the relevant documents that were retrieved at various cutoff points for all 36 queries. The results for the ``stemmed Jawi'' document showed a significantly different start, at a cutoff of 40, compared with the ``nonstemmed Jawi'' documents. This result shows the usefulness of a Jawi stemmer for retrieving relevant documents in the Jawi script.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Esmaili:2014:TKI, author = "Kyumars Sheykh Esmaili and Shahin Salavati and Anwitaman Datta", title = "Towards {Kurdish} Information Retrieval", journal = j-TALIP, volume = "13", number = "2", pages = "7:1--7:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2556948", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Jun 20 18:22:19 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Kurdish language is an Indo-European language spoken in Kurdistan, a large geographical region in the Middle East. Despite having a large number of speakers, Kurdish is among the less-resourced languages and has not seen much attention from the IR and NLP research communities. This article reports on the outcomes of a project aimed at providing essential resources for processing Kurdish texts. A principal output of this project is Pewan, the first standard Test Collection to evaluate Kurdish Information Retrieval systems. 
The other language resources that we have built include a lightweight stemmer and a list of stopwords. Our second principal contribution is using these newly-built resources to conduct a thorough experimental study on Kurdish documents. Our experimental results show that normalization, and to a lesser extent, stemming, can greatly improve the performance of Kurdish IR systems.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sharma:2014:WPS, author = "Manoj Kumar Sharma and Debasis Samanta", title = "Word Prediction System for Text Entry in {Hindi}", journal = j-TALIP, volume = "13", number = "2", pages = "8:1--8:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2617590", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Jun 20 18:22:19 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/spell.bib; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Word prediction is treated as an efficient technique to enhance text entry rate. Existing word prediction systems predict a word when a user correctly enters the initial few characters of the word. In fact, a word prediction system fails if the user makes errors in the initial input. Therefore, there is a need to develop a word prediction system that predicts desired words while coping with errors in initial entries. This requirement is more relevant in the case of text entry in Indian languages, which are involved with a large set of alphabets, words with complex characters and inflections, phonetically similar sets of characters, etc. In fact, text composition in Indian languages involves frequent spelling errors, which presents a challenge to develop an efficient word prediction system. 
In this article, we address this problem and propose a novel word prediction system. Our proposed approach has been tried with Hindi, the national language of India. Experiments with users substantiate 43.77\% keystroke savings, 92.49\% hit rate, and 95.82\% of prediction utilization with the proposed word prediction system. Our system also reduces the spelling error by 89.75\%.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Keskes:2014:SAT, author = "Iskandar Keskes and Farah Benamara Zitoune and Lamia Hadrich Belguith", title = "Splitting {Arabic} Texts into Elementary Discourse Units", journal = j-TALIP, volume = "13", number = "2", pages = "9:1--9:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2601401", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Jun 20 18:22:19 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In this article, we propose the first work that investigates the feasibility of Arabic discourse segmentation into elementary discourse units within the segmented discourse representation theory framework. We first describe our annotation scheme that defines a set of principles to guide the segmentation process. Two corpora have been annotated according to this scheme: elementary school textbooks and newspaper documents extracted from the syntactically annotated Arabic Treebank. Then, we propose a multiclass supervised learning approach that predicts nested units. Our approach uses a combination of punctuation, morphological, lexical, and shallow syntactic features. We investigate how each feature contributes to the learning process. We show that an extensive morphological analysis is crucial to achieve good results in both corpora. 
In addition, we show that adding chunks does not boost the performance of our system.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Rubin:2014:TPG, author = "Victoria L. Rubin", title = "{TALIP} Perspectives, Guest Editorial Commentary: Pragmatic and Cultural Considerations for Deception Detection in {Asian} Languages", journal = j-TALIP, volume = "13", number = "2", pages = "10:1--10:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2605292", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Fri Jun 20 18:22:19 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In hopes of sparking a discussion, I argue for much needed research on automated deception detection in Asian languages. The task of discerning truthful texts from deceptive ones is challenging, but a logical sequel to opinion mining. I suggest that applied computational linguists pursue broader interdisciplinary research on cultural differences and pragmatic use of language in Asian cultures, before turning to detection methods based on a primarily Western (English-centric) worldview. 
Deception is fundamentally human, but how do various cultures interpret and judge deceptive behavior?", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Na:2014:LAN, author = "Hwidong Na and Jong-Hyeok Lee", title = "Linguistic analysis of non-{ITG} word reordering between language pairs with different word order typologies", journal = j-TALIP, volume = "13", number = "3", pages = "11:1--11:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2644810", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Oct 4 06:09:41 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "The Inversion Transduction Grammar (ITG) constraints have been widely used for word reordering in machine translation studies. They are, however, so restricted that some types of word reordering cannot be handled properly. We analyze three corpora between SVO and SOV languages: Chinese--Korean, English--Japanese, and English--Korean. In our analysis, sentences that require non-ITG word reordering are manually categorized. We also report the results for two quantitative measures that reveal the significance of non-ITG word reordering. In conclusion, we suggest that ITG constraints are insufficient to deal with word reordering in real situations.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{A:2014:AMO, author = "Bharath A. 
and Sriganesh Madhvanath", title = "Allograph modeling for online handwritten characters in {Devanagari} using constrained stroke clustering", journal = j-TALIP, volume = "13", number = "3", pages = "12:1--12:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629622", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Oct 4 06:09:41 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Writer-specific character writing variations such as those of stroke order and stroke number are an important source of variability in the input when handwriting is captured ``online'' via a stylus and a challenge for robust online recognition of handwritten characters and words. It has been shown by several studies that explicit modeling of character allographs is important for achieving high recognition accuracies in a writer-independent recognition system. While previous approaches have relied on unsupervised clustering at the character or stroke level to find the allographs of a character, in this article we propose the use of constrained clustering using automatically derived domain constraints to find a minimal set of stroke clusters. The allographs identified have been applied to Devanagari character recognition using Hidden Markov Models and Nearest Neighbor classifiers, and the results indicate substantial improvement in recognition accuracy and/or reduction in memory and computation time when compared to alternate modeling techniques.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Paik:2014:IBF, author = "Jiaul H. Paik and Dipasree Pal and Swapan K. 
Parui", title = "Incremental blind feedback: an effective approach to automatic query expansion", journal = j-TALIP, volume = "13", number = "3", pages = "13:1--13:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2611521", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Oct 4 06:09:41 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Automatic query expansion (AQE) is a useful technique for enhancing the effectiveness of information retrieval systems. In this article, we propose a novel AQE algorithm which first adopts a systematic incremental approach to choose feedback documents from the top retrieved set and then selects the expansion terms aggregating the scores from each feedback set. We also devise a term selection measure and a number of weighting schemes based on easily computable features. A set of experiments with a large number of standard test collections reveals that the proposed incremental blind feedback algorithm outperforms a number of state-of-the-art query expansion methods with remarkable significance and consistency.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Saharia:2014:SRP, author = "Navanath Saharia and Utpal Sharma and Jugal Kalita", title = "Stemming resource-poor {Indian} languages", journal = j-TALIP, volume = "13", number = "3", pages = "14:1--14:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629670", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Oct 4 06:09:41 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Stemming is a basic method for morphological normalization of natural language texts. 
In this study, we focus on the problem of stemming several resource-poor languages from Eastern India, viz., Assamese, Bengali, Bishnupriya Manipuri and Bodo. While Assamese, Bengali and Bishnupriya Manipuri are Indo-Aryan, Bodo is a Tibeto-Burman language. We design a rule-based approach to remove suffixes from words. To reduce over-stemming and under-stemming errors, we introduce a dictionary of frequent words. We observe that, for these languages a dominant amount of suffixes are single letters creating problems during suffix stripping. As a result, we introduce an HMM-based hybrid approach to classify the mis-matched last character. For each word, the stem is extracted by calculating the most probable path in four HMM states. At each step we measure the stemming accuracy for each language. We obtain 94\% accuracy for Assamese and Bengali and 87\%, and 82\% for Bishnupriya Manipuri and Bodo, respectively, using the hybrid approach. We compare our work with Morfessor [Creutz and Lagus 2005]. As of now, there is no reported work on stemming for Bishnupriya Manipuri and Bodo. Our results on Assamese and Bengali show significant improvement over prior published work [Sarkar and Bandyopadhyay 2008; Sharma et al. 
2002, 2003].", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Sproat:2014:SJ, author = "Richard Sproat", title = "The state of the journal", journal = j-TALIP, volume = "13", number = "3", pages = "15:1--15:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2656620", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Sat Oct 4 06:09:41 MDT 2014", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Bang:2014:PVP, author = "Jeesoo Bang and Jonghoon Lee and Gary Geunbae Lee and Minhwa Chung", title = "Pronunciation Variants Prediction Method to Detect Mispronunciations by {Korean} Learners of {English}", journal = j-TALIP, volume = "13", number = "4", pages = "16:1--16:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629545", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jan 7 15:23:49 MST 2015", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "This article presents an approach to nonnative pronunciation variants modeling and prediction. The pronunciation variants prediction method was developed by generalized transformation-based error-driven learning (GTBL). The modified goodness of pronunciation (GOP) score was applied to effective mispronunciation detection using logistic regression machine learning under the pronunciation variants prediction. 
English-read speech data uttered by Korean-speaking learners of English were collected, then pronunciation variation knowledge was extracted from the differences between the canonical phonemes and the actual phonemes of the speech data. With this knowledge, an error-driven learning approach was designed that automatically learns phoneme variation rules from phoneme-level transcriptions. The learned rules generate an extended recognition network to detect mispronunciations. Three different mispronunciation detection methods were tested including our logistic regression machine learning method with modified GOP scores and mispronunciation preference features; all three methods yielded significant improvement in predictions of pronunciation variants, and our logistic regression method showed the best performance.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Liu:2014:DTL, author = "Lemao Liu and Tiejun Zhao and Taro Watanabe and Hailong Cao and Conghui Zhu", title = "Discriminative Training for Log-Linear Based {SMT}: Global or Local Methods", journal = j-TALIP, volume = "13", number = "4", pages = "17:1--17:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637478", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jan 7 15:23:49 MST 2015", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "In statistical machine translation, the standard methods such as MERT tune a single weight with regard to a given development data. However, these methods suffer from two problems due to the diversity and uneven distribution of source sentences. First, their performance is highly dependent on the choice of a development set, which may lead to an unstable performance for testing. 
Second, the sentence level translation quality is not assured since tuning is performed on the document level rather than on sentence level. In contrast with the standard global training in which a single weight is learned, we propose novel local training methods to address these two problems. We perform training and testing in one step by locally learning the sentence-wise weight for each input sentence. Since the time of each tuning step is unnegligible and learning sentence-wise weights for the entire test set means many passes of tuning, it is a great challenge for the efficiency of local training. We propose an efficient two-phase method to put the local training into practice by employing the ultraconservative update. On NIST Chinese-to-English translation tasks with both medium and large scales of training data, our local training methods significantly outperform standard methods with the maximal improvements up to 2.0 BLEU points, meanwhile their efficiency is comparable to that of the standard methods.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", } @Article{Zhuang:2014:EPP, author = "Yi Zhuang and Qing Li and Dickson K. W. Chiu and Zhiang Wu and Haiyang Hu", title = "Efficient Personalized Probabilistic Retrieval of {Chinese} Calligraphic Manuscript Images in Mobile Cloud Environment", journal = j-TALIP, volume = "13", number = "4", pages = "18:1--18:??", month = dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629575", ISSN = "1530-0226 (print), 1558-3430 (electronic)", ISSN-L = "1530-0226", bibdate = "Wed Jan 7 15:23:49 MST 2015", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/talip.bib", abstract = "Ancient language manuscripts constitute a key part of the cultural heritage of mankind. 
As one of the most important languages, Chinese historical calligraphy work has contributed to not only the Chinese cultural heritage but also the world civilization at large, especially for Asia. To support deeper and more convenient appreciation of Chinese calligraphy works, based on our previous work on the probabilistic retrieval of historical Chinese calligraphic character manuscripts repositories, we propose a system framework of the multi-feature-based Chinese calligraphic character images probabilistic retrieval in the mobile cloud network environment, which is called the DPRC. To ensure retrieval efficiency, we further propose four enabling techniques: (1) DRL-based probability propagation, (2) optimal data placement scheme, (3) adaptive data robust transmission algorithm, and (4) index support filtering scheme. Comprehensive experiments are conducted to testify the effectiveness and efficiency of our proposed DPRC method.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Asian Language Information Processing", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820", }