% Genre-as-interface, feature selection, query refinement and core NLP textbooks.
% Months use the standard three-letter macros so each style can format them.

@inproceedings{toms99metaphor,
  author = {E. G. Toms and D. G. Campbell},
  title = {Genre as interface metaphor: Exploiting form and function in digital environments},
  booktitle = {Proceedings of the 32nd Hawaii International Conference on System Sciences ({HICSS '99})},
  year = {1999}
}

@inproceedings{rogati02,
  author = {M. Rogati and Y. Yang},
  title = {High-Performing Feature Selection for Text Classification},
  booktitle = {Proceedings of ACM Conference on Information and Knowledge Management {(CIKM)} '02},
  year = {2002},
  month = nov,
  address = {McLean, Virginia},
  publisher = {ACM Press}
}

@inproceedings{lau99patterns,
  author = {T. Lau and E. Horvitz},
  title = {Patterns of Search: Analyzing and Modeling Web Query Refinement},
  booktitle = {Proceedings of the Seventh International Conference on User Modeling},
  year = {1999},
  publisher = {ACM Press},
  url = {citeseer.nj.nec.com/270469.html}
}

@misc{belkin00relevance,
  author = {N. Belkin and C. Cool and J. Head and J. Jeng and D. Kelly and S. Lin and L. Lobash and S. Park and P. Savage-Knepshield and C. Sikora},
  title = {Relevance feedback versus local context analysis as term suggestion devices},
  year = {2000},
  url = {citeseer.nj.nec.com/belkin00relevance.html}
}

@book{manning99statnlp,
  author = {C. Manning and H. Sch{\"u}tze},
  title = {Foundations of Statistical Natural Language Processing},
  year = {1999},
  publisher = {The MIT Press},
  address = {Cambridge, Massachusetts}
}

@book{browncorpus,
  author = {W. Francis and H. Kucera},
  title = {Frequency Analysis of English Usage},
  year = {1982},
  publisher = {Houghton Mifflin Co},
  address = {New York}
}

@book{mitchell97ml,
  author = "T. M.
Mitchell",
  title = {Machine Learning},
  year = {1997},
  publisher = {McGraw-Hill},
  address = {New York}
}

% Web retrieval, personalised spiders and recommender-style search.
% Conference dates are written as a month macro joined to a day range.

@book{BaezaYates99,
  author = {Baeza-Yates, Ricardo and Ribeiro-Neto, Berthier},
  title = {Modern Information Retrieval},
  year = {1999},
  publisher = {Addison Wesley},
  address = {Reading, US}
}

@article{KleLaw01,
  author = {J. Kleinberg and S. Lawrence},
  title = {The Structure of the Web},
  journal = {Science},
  volume = {294},
  year = {2001}
}

@misc{chau-personalized,
  author = {M. Chau and H. Chen},
  title = {Personalized and Focused Web Spiders},
  lastchecked = {February 19th, 2003},
  month = feb,
  year = {2003},
  url = {citeseer.nj.nec.com/548327.html}
}

@misc{kobayashi99information,
  author = {M. Kobayashi and K. Takeda},
  title = {Information Retrieval on the web: Selected Topics},
  text = {M. Kobayashi and K. Takeda, Information Retrieval on the web: Selected Topics, IBM Research, Tokyo Research Laboratory, IBM Japan, 1999.},
  year = {1999},
  url = {citeseer.nj.nec.com/kobayashi99information.html}
}

@inproceedings{glover99recommending,
  author = {E. Glover and S. Lawrence and M. D. Gordon and W. Birmingham and C. L. Giles},
  title = {Recommending Web Documents Based on User Preferences},
  booktitle = {Proceedings of Special Interest Group on Information Retrieval {(SIGIR)} 99 Workshop on Recommender Systems},
  address = {Berkeley, CA},
  month = aug,
  year = {1999},
  url = {citeseer.nj.nec.com/glover99recommending.html}
}

@inproceedings{glover01improving,
  author = {E. Glover and G. Flake and S. Lawrence and W. P. Birmingham and A. Kruger and C. L. Giles and D. Pennock},
  title = {Improving Category Specific Web Search by Learning Query Modifications},
  booktitle = {Symposium on Applications and the Internet, {SAINT}},
  address = {San Diego, CA},
  month = jan # "~8--12",
  year = {2001},
  url = {citeseer.nj.nec.com/article/glover01improving.html}
}

@article{Gershenson02Discrim,
  author = "C. Gershenson and M. A. Porter and A. Probst and M. Marko and A.
Das",
  title = {A Study on the Relevance of Information in Discriminative and Non-Discriminative Media},
  journal = {InterJournal of Complex Systems},
  volume = {533},
  year = {2002}
}

% Document categorisation agents, hierarchic organisation and genre learning.

@inproceedings{han98webace,
  author = {E. Han and D. Boley and M. Gini and R. Gross and K. Hastings and G. Karypis and V. Kumar and B. Mobasher and J. Moore},
  title = {{WebACE}: {A} Web Agent for Document Categorization and Exploration},
  booktitle = {Proceedings of the 2nd International Conference on Autonomous Agents (Agents'98)},
  month = may # "~9--13",
  publisher = {ACM Press},
  address = {New York},
  editor = {K. P. Sycara and M. Wooldridge},
  isbn = {0-89791-983-1},
  pages = {408--415},
  year = {1998},
  url = {citeseer.nj.nec.com/han98webace.html}
}

@article{vinokourov-probabilistic,
  author = {A. Vinokourov and M. Girolami},
  title = {A Probabilistic Framework for the Hierarchic Organisation and Classification of Document Collections},
  journal = {Information Processing and Management},
  year = {2002}
}

@misc{somlo-proposal,
  author = {G. Somlo},
  title = {Thesis Proposal},
  year = {1999},
  url = {www.cs.colostate.edu/~somlo/proposal.ps.gz}
}

@inproceedings{karlgren98iterative,
  author = {J. Karlgren and I. Bretan and J. Dewe and A. Hallberg and N. Wolkert},
  title = {Iterative Information Retrieval Using Fast Clustering and Usage-Specific Genres},
  booktitle = {Proceedings of the Eighth DELOS Workshop on User Interfaces in Digital Libraries},
  address = {Stockholm},
  pages = {85--92},
  month = oct,
  year = {1998},
  url = {http://citeseer.ist.psu.edu/karlgren98iterative.html}
}

@inproceedings{finn03learning,
  author = {A. Finn and N. Kushmerick},
  title = {Learning to classify documents according to genre},
  year = {2003},
  booktitle = {{IJCAI}-03 Workshop on Computational Approaches to Style Analysis and Synthesis},
  url = {citeseer.nj.nec.com/562083.html}
}

@inproceedings{WeigueFan:1999:agmfGPeir,
  author = "W. Fan and M. D. Gordon and P.
Pathak",
  title = {Automatic generation of matching functions by genetic programming for effective information retrieval},
  booktitle = {Proceedings of the 1999 Americas Conference on Information Systems},
  year = {1999},
  editor = {W. D. Haseman and D. L. Nazareth},
  pages = {49--51},
  address = {Milwaukee, WI, USA},
  month = aug # "~13--15",
  organization = {Association for Information Systems},
  keywords = {genetic algorithms, genetic programming},
  url = {http://filebox.vt.edu/users/wfan/paper/Amcis_final.pdf},
  size = {3 pages},
  abstract = {With the advent of the Internet, online resources are increasingly available. Many users choose popular search engines to perform an online search to satisfy their information need. However, these search engines tend to turn up many non-relevant documents, which make their retrieval precision very low. How to find appropriate ranking metrics to retrieve more relevant documents and fewer non-relevant documents for users remains a big challenge to the information retrieval community. In this paper, we propose a new framework that combines the merits of genetic programming and relevance feedback techniques to automatically generate and refine the matching functions used for document ranking. This approach overcomes the shortcoming of traditional ranking algorithms using a fixed ranking strategy. It also gives some new ideas and hints for information retrieval professionals.},
  notes = {AMCIS99 https://commerce.mindspring.com/www.icisnet.org/proc.html Prototype implemented in C. Fitness based on user feedback}
}

% Genetic-algorithm approaches to retrieval. "notes", "size" and "abstract" are
% annotation fields that standard styles ignore; they are kept as-is on purpose.
% NOTE(review): the venue below replaces an author affiliation that had been
% stored in "address" -- confirm against the proceedings front matter.

@inproceedings{pathakgordonfan2000,
  author = {P. Pathak and M. Gordon and W. Fan},
  title = {Effective information retrieval using genetic algorithms based matching functions adaptation},
  booktitle = {Proceedings of the 33rd Annual Hawaii International Conference on System Sciences, 2000},
  year = {2000},
  month = jan # "~4--7",
  pages = {533--540},
  isbn = {0-7695-0493-0},
  address = {Maui, Hawaii}
}

@article{dss99,
  author= "D.
Boley and M. Gini and R. Gross and S. Han and K. Hastings and G. Karypis and V. Kumar and B. Mobasher and J. Moore",
  title = {Partitioning-Based Clustering for Web Document Categorization},
  journal = {Decision Support Systems},
  volume = {27},
  number = {3},
  pages = {329--341},
  year = {1999},
  url = {citeseer.nj.nec.com/9105.html}
}

% Page segmentation, genre features and hypermedia retrieval.
% Conference papers use the inproceedings type with "booktitle".

@inproceedings{yu2003PageSegmentation,
  author = {S. Yu and D. Cai and J. Wen and W. Ma},
  title = {Improving Pseudo-Relevance Feedback in Web Information Retrieval Using Web Page Segmentation},
  booktitle = {The Twelfth International World Wide Web Conference ({WWW2003})},
  month = may # "~20--24",
  year = {2003}
}

@inproceedings{leemyaeng2002genrefeatures,
  author = {Y. Lee and S. H. Myaeng},
  title = {Text Genre Classification with Genre-Revealing and Subject-Revealing Features},
  booktitle = {Proceedings of Special Interest Group on Information Retrieval {(SIGIR)} '02},
  year = {2002},
  month = aug # "~11--15",
  pages = {145--149}
}

@inproceedings{marinheiro98expanding,
  author = {R. N. Marinheiro and W. Hall},
  title = {Expanding a Hypertext Information Retrieval System to Incorporate Multimedia Information},
  booktitle = {Proceedings of the Hawaii International Conference on System Sciences {(HICSS 31)} (2)},
  pages = {286--295},
  year = {1998},
  url = {citeseer.nj.nec.com/203305.html}
}

@inproceedings{mao00automatic,
  author = {X. Mao and M. Kikukawa and K. Kashio and A. Imamiya},
  title = {Automatic Generation of Hair Texture with Line Integral Convolution},
  booktitle = {{IV}},
  pages = {303--308},
  year = {2000},
  url = {citeseer.nj.nec.com/mao00automatic.html}
}

@inproceedings{mao00pset,
  author = {S. Mao and T. Kanungo},
  title = {{PSET}: A Page Segmentation Evaluation Toolkit},
  booktitle = {Fourth {IAPR} International Workshop on Document Analysis Systems},
  address = {Rio de Janeiro, Brazil},
  month = dec,
  year = {2000},
  url = {citeseer.nj.nec.com/mao00pset.html}
}

@inproceedings{mao00empirical,
  author = {S. Mao and T.
Kanungo},
  title = {Empirical Performance Evaluation of Page Segmentation Algorithms},
  booktitle = {Proceedings of SPIE Conference on Document Recognition},
  address = {San Jose, CA},
  month = jan,
  year = {2000},
  url = {citeseer.nj.nec.com/mao00empirical.html}
}

% Zone classification, web mining and genre taxonomy.
% A literal ampersand must be written as \& or LaTeX stops with an alignment error.

@inproceedings{wang01zone,
  author = {Y. Wang and R. Haralick and I. Phillips},
  title = {Zone Content Classification and its Performance Evaluation},
  booktitle = {Sixth International Conference on Document Analysis and Recognition ({ICDAR01})},
  pages = {540--544},
  address = {Seattle, WA},
  month = sep,
  year = {2001},
  url = {citeseer.nj.nec.com/wang01zone.html}
}

@phdthesis{wang-document,
  author = {Y. Wang},
  title = {Document Analysis: Table Structure Understanding and Zone Content Classification},
  school = {University of Washington},
  year = {2002},
  url = {citeseer.nj.nec.com/wang02document.html}
}

@article{kosala00web,
  author = {R. Kosala and H. Blockeel},
  title = {Web Mining Research: {A} Survey},
  journal = {SIGKDD Explorations: Newsletter of the Special Interest Group (SIG) on Knowledge Discovery \& Data Mining, ACM},
  volume = {2},
  publisher = {ACM},
  year = {2000},
  url = {citeseer.nj.nec.com/kosala00web.html}
}

@inproceedings{etemad94page,
  author = {K. Etemad and D. Doermann and R. Chellappa},
  title = {Page Segmentation Using Decision Integration and Wavelet Packets},
  booktitle = {12th International Conference on Pattern Recognition},
  volume = {2},
  address = {Jerusalem, Israel},
  pages = {345--349},
  year = {1994},
  url = {citeseer.nj.nec.com/etemad94page.html}
}

% NOTE(review): journal changed from "Information Systems" to match volume 19(4), 2001 -- confirm.
@article{yoshioka01genre,
  author = {T. Yoshioka and G. Herman and J. Yates and W. J. Orlikowski},
  title = {Genre Taxonomy: A Knowledge Repository of Communicative Actions},
  journal = {{ACM} Transactions on Information Systems},
  volume = {19},
  number = {4},
  pages = {431--456},
  year = {2001},
  url = {citeseer.ist.psu.edu/yoshioka99genre.html}
}

@misc{taketa99nunokawa,
  author = "K. Abe and T. Taketa and H.
Nunokawa",
  title = {An Efficient Information Retrieval Method in {WWW} Using Genetic Algorithms},
  text = {ICPP Workshops 1999},
  year = {1999},
  pages = {522--527}
}

% Genetic programming for classification/querying, and early web-genre theory.

@inproceedings{marmelstein98lamont,
  author = {R. E. Marmelstein and G. B. Lamont},
  title = {Pattern Classification using a Hybrid Genetic Program -- Decision Tree approach},
  booktitle = {Proceedings of the Third Annual Conference on Genetic Programming},
  publisher = {Morgan Kaufmann},
  year = {1998}
}

@inproceedings{kraft94buckles,
  author = {D. H. Kraft and F. E. Petry and B. P. Buckles and T. Sadasivan},
  title = {The use of genetic programming to build queries for information retrieval},
  booktitle = {Proceedings IEEE Symp. Evol. Comput., 1994.},
  year = {1994}
}

@inproceedings{furuta95genre,
  author = {R. Furuta and C. C. Marshall},
  title = {Genre as Reflection of Technology in the World-Wide Web},
  booktitle = {{IWHD}},
  pages = {182--195},
  year = {1995},
  url = {citeseer.nj.nec.com/furuta96genre.html}
}

@article{yates99explicitorg,
  author = {J. Yates and W. J. Orlikowski},
  title = {Explicit and Implicit Structuring of Genres},
  journal = {Organization Science},
  volume = {10},
  pages = {83--103},
  year = {1999}
}

@inproceedings{yates97digital,
  author = {S. Yates and T. Sumner},
  title = {Digital Genres and the New Burden of Fixity},
  booktitle = {Hawaiian International Conference on System Sciences ({HICSS} 30)},
  address = {Wailea, Hawaii},
  publisher = {IEEE Computer Press},
  month = jan # "~7--10",
  year = {1997},
  url = {citeseer.nj.nec.com/yates97digital.html}
}

% "year" holds a publication-status word here; replace with the real year once known.
@article{delinmodel,
  author = {J. Delin and J. Bateman and P. Allen},
  title = {A Model of Genre in Document Layout},
  journal = {Information Design Journal},
  year = {forthcoming},
  url = {citeseer.nj.nec.com/411087.html}
}

@article{dillon00genres,
  author = "A. Dillon and B. A.
Gushrowski",
  title = {Genres and the {WEB}: Is the Personal Home Page the First Uniquely Digital Genre?},
  journal = {Journal of the American Society of Information Science},
  volume = {51},
  number = {2},
  pages = {202--205},
  year = {2000},
  url = {citeseer.nj.nec.com/567742.html}
}

% Web genres: emergence, linking effects and stylistic recognition.

@inproceedings{crowston97emergent,
  author = {K. Crowston and M. Williams},
  title = {Reproduced and Emergent Genres of Communication on the World-Wide Web},
  booktitle = {Proceedings of the 30th Hawaiian International Conference on System Sciences},
  publisher = {{IEEE} Computer Press},
  address = {Wailea, Hawaii},
  month = jan # "~7--10",
  year = {1997}
}

@inproceedings{dewdney01substance,
  author = {N. Dewdney and C. VanEss-Dykema and R. MacMillan},
  title = {The Form is the Substance: Classification of Genres in Text},
  booktitle = {{ACL} Workshop on Human Language Technology and Knowledge Management},
  address = {Toulouse, France},
  month = jul # "~6--7",
  year = {2001},
  url = {http://www.elsnet.org/km2001/dewdney.pdf}
}

@inproceedings{crowston99effects,
  author = {K. Crowston and M. Williams},
  title = {The Effects of Linking on Genres of Web Documents},
  booktitle = {Proceedings of the Hawaiian International Conference on System Sciences},
  address = {Hawaii},
  year = {1999},
  url = {citeseer.nj.nec.com/crowston99effects.html}
}

@inproceedings{karlgren94recognizing,
  author = {J. Karlgren and D. Cutting},
  title = {Recognizing Text Genres with Simple Metrics using Discriminant Analysis},
  booktitle = {Proceedings of the 15th International Conference on Computational Linguistics ({COLING} 94)},
  volume = {II},
  address = {Kyoto, Japan},
  pages = {1071--1075},
  year = {1994},
  url = {citeseer.nj.nec.com/karlgren94recognizing.html}
}

% "Fall" is a season, not a month, so it stays a literal string rather than a macro.
@article{crowston03collections,
  author = {K. Crowston and B. H. Kwasnik},
  title = {Can Document-Genre Metadata Improve Information Access to Large Digital Collections?},
  journal = {Library Trends},
  month = {Fall},
  year = {2003}
}

@inproceedings{argamon98routing,
  author = "S.
Argamon and M. Koppel and G. Avneri",
  title = {Routing Documents According to Style},
  booktitle = {First International Workshop on Innovative Information Systems},
  year = {1998},
  url = {citeseer.nj.nec.com/argamon98routing.html}
}

% Web page classification and feature selection for text categorisation.
% NOTE(review): the first entry below previously lacked its leading at-sign and
% was therefore skipped by BibTeX; it is restored here on the assumption that
% the omission was accidental -- remove the at-sign again if it was intentional.

@misc{andersen-genres,
  author = {P. B. Andersen},
  title = {Genres as Self-organising Systems},
  url = {citeseer.nj.nec.com/251963.html}
}

@misc{kovacevic-web,
  author = {M. Kovacevic},
  title = {Web Page Classification Using Spatial},
  url = {citeseer.nj.nec.com/538247.html}
}

@inproceedings{riboni-feature,
  author = {D. Riboni},
  title = {Feature Selection for Web Page Classification},
  booktitle = {EURASIA-ICT 2002 Proceedings of the Workshops},
  editor = {A. M. Tjoa},
  year = {2003},
  url = {citeseer.nj.nec.com/riboni02feature.html}
}

@misc{asirvatham-kranthi-web,
  author = {A. P. Asirvatham and K. K. Ravi},
  title = {Web Page Classification based on Document Structure},
  url = {citeseer.nj.nec.com/asirvatham01web.html}
}

@inproceedings{yang97comparative,
  author = {Y. Yang and J. O. Pedersen},
  title = {A Comparative Study on Feature Selection in Text Categorization},
  booktitle = {Proceedings of {ICML}-97, 14th International Conference on Machine Learning},
  publisher = {Morgan Kaufmann Publishers},
  address = {San Francisco},
  editor = {D. H. Fisher},
  pages = {412--420},
  year = {1997},
  url = {citeseer.nj.nec.com/yang97comparative.html}
}

@article{yang99evaluation,
  author = {Y. Yang},
  title = {An Evaluation of Statistical Approaches to Text Categorization},
  journal = {Information Retrieval},
  volume = {1},
  number = {1/2},
  publisher = {Kluwer Academic Publishers},
  pages = {69--90},
  year = {1999},
  url = {citeseer.nj.nec.com/yang97evaluation.html}
}

@misc{wong00incremental,
  author = {W. Wong and A. Fu},
  title = {Incremental Document Clustering for Web Page Classification},
  text = "Wai-chiu Wong and Ada Fu, Incremental Document Clustering for Web Page Classification, IEEE 2000 Int. Conf. on Info.
Society in the 21st century: emerging technologies and new challenges (IS2000), Nov 5-8, 2000, Japan",
  year = {2000},
  url = {citeseer.ist.psu.edu/article/wong01incremental.html}
}

% Document routing and genre-based approaches to enterprise information.

@inproceedings{iyer00boosting,
  author = {R. D. Iyer and D. D. Lewis and R. E. Schapire and Y. Singer and A. Singhal},
  title = {Boosting for Document Routing},
  booktitle = {Proceedings of 9th {ACM} International Conference on Information and Knowledge Management {(CIKM)} '00},
  publisher = {ACM Press},
  address = {New York, US},
  editor = {A. Agah and J. Callan and E. Rundensteiner},
  pages = {70--77},
  year = {2000},
  url = {citeseer.ist.psu.edu/iyer00boosting.html}
}

% NOTE(review): the second and third surnames were restored from what looked like
% accent-stripped text ("Printa", "Tyrvainen") -- confirm against the paper.
@inproceedings{karjalainen00genrebased,
  author = {A. Karjalainen and T. P{\"a}iv{\"a}rinta and P. Tyrv{\"a}inen and J. Rajala},
  title = {Genre-based Metadata for Enterprise Document Management},
  booktitle = {Proceedings of the 33rd Hawaii International Conference on System Sciences ({HICSS '00})},
  year = {2000},
  url = {citeseer.nj.nec.com/karjalainen00genrebased.html}
}

% Report numbers and issuing body now sit in fields that report styles print;
% NOTE(review): "Working Paper" is an assumed report type -- confirm.
@techreport{yoshioka-coordinating,
  author = {T. Yoshioka and G. Herman},
  title = {Coordinating Information Using Genres},
  type = {Working Paper},
  number = {CCS No. 214, Sloan No. 4127},
  institution = {Center for Coordination Science},
  year = {2000},
  url = {citeseer.nj.nec.com/yoshioka00coordinating.html}
}

@misc{antunes-applying,
  author = {P. Antunes and C. J. Costa and J. F. Dias},
  title = {Applying Genre Analysis to {EMS} Design: The Example of a Small Accounting Firm},
  url = {citeseer.nj.nec.com/446433.html}
}

@misc{karjalainen-bridging,
  author = {A. Karjalainen and A. Salminen},
  title = {Bridging the Gap Between Hard and Soft Information Genres},
  url = {citeseer.nj.nec.com/489131.html}
}

@article{yates92genresorganiz,
  author = {J. Yates and W. J.
Orlikowski},
  title = {Genres of Organizational Communication: A Structurational Approach to Studying Communication and Media},
  journal = {Academy of Management Review},
  volume = {17},
  pages = {299--326},
  year = {1992}
}

% Genre-based navigation, automatic web-genre identification and genre systems.

@inproceedings{roussinov01navigation,
  author = {D. Roussinov and K. Crowston and M. Nilan and B. Kwasnik and J. Cai and X. Liu},
  title = {Genre Based Navigation on the Web},
  booktitle = {Proceedings of the 34th Hawaiian International Conference on System Sciences},
  publisher = {{IEEE} Computer Press},
  address = {Hawaii},
  year = {2001}
}

@inproceedings{rehm02autogenre,
  author = {G. Rehm},
  title = {Towards Automatic Web Genre Identification},
  booktitle = {Proceedings of the Hawaiian International Conference on System Sciences},
  publisher = {{IEEE} Computer Press},
  address = {Oahu, HI},
  month = jan # "~7--10",
  year = {2002}
}

@inproceedings{yates97collab,
  author = {J. Yates and W. J. Orlikowski and J. Rennecker},
  title = {Collaborative Genres for Collaboration: Genre Systems in Digital Media},
  booktitle = {Proceedings of the Thirtieth Hawaiian International Conference on System Sciences ({HICSS} 30)},
  publisher = {{IEEE} Computer Press},
  address = {Wailea, Hawaii},
  month = jan # "~7--10",
  year = {1997}
}

% Only the acronym is brace-protected so styles can still apply their own title casing.
@article{davidlee01genresbnc,
  author = {D. Lee},
  title = {Genres, Registers, Text Types, Domains, and Styles: Clarifying the Concepts and Navigating a Path Through the {BNC} Jungle},
  journal = {Language Learning and Technology},
  volume = {5},
  number = {3},
  month = sep,
  year = {2001},
  pages = {37--72},
  url = {http://llt.msu.edu/vol5num3/pdf/lee.pdf}
}

@misc{malone99online,
  author = {T. W. Malone and K. Crowston and J. Lee and B. Pentland and C. Dellarocas and G. Wyner and J. Quimby and C. S. Osborn and A. Bernstein},
  title = {Tools for Inventing Organizations: Toward a Handbook of Organizational Process},
  url = {http://ccs.mit.edu/CCSWP198/index.htm}
}

@inproceedings{kessler97automatic,
  author = "B. Kessler and G. Nunberg and H.
Sch{\"u}tze",
  title = {Automatic Detection of Text Genre},
  booktitle = {Proceedings of the Thirty-Fifth Annual Meeting of the Association for Computational Linguistics and Eighth Conference of the European Chapter of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address = {Somerset, New Jersey},
  editor = {P. R. Cohen and W. Wahlster},
  pages = {32--38},
  year = {1997},
  url = {citeseer.ist.psu.edu/article/kessler97automatic.html}
}

% Stylistic variation and corpus tooling. The "text" fields hold the raw citation
% string from the source index; standard styles ignore them.

@inproceedings{karlgren97visualizing,
  author = {J. Karlgren and T. Straszheim},
  title = {Visualizing Stylistic Variation},
  text = {Jussi Karlgren and Troy Straszheim. 1997. Visualizing Stylistic Variation},
  booktitle = {Proceedings of the Hawaiian International Conference on System Sciences ({HICSS} 30)},
  address = {Maui, Hawaii},
  year = {1997},
  url = {citeseer.nj.nec.com/karlgren97visualizing.html}
}

@misc{karlgren96experiments,
  author = {J. Karlgren and T. Straszheim},
  title = {Experiments in Stylistic Analysis},
  text = {Jussi Karlgren and Troy Straszheim. 1996. Experiments in Stylistic Analysis.},
  year = {1996},
  url = {citeseer.nj.nec.com/karlgren96experiments.html}
}

@article{asmussen94corpus,
  author = {J. Asmussen},
  title = {The Text Corpus and Corpus Retrieval System},
  journal = {Hermes, Journal of Linguistics},
  volume = {13},
  year = {1994}
}

@misc{wang-improvement,
  author = {Y. Wang and R. Haralick and I. Phillips},
  title = {Improvement of Zone Content Classification by using Background Analysis},
  text = {Y. Wang, R. Haralick, and I. T. Phillips. Improvement of zone content classification by using background analysis. In Fourth IAPR International Workshop on Document Analysis Systems. (DAS2000).},
  note = {Fourth IAPR International Workshop on Document Analysis Systems.
(DAS2000)},
  year = {2000},
  url = {citeseer.nj.nec.com/557432.html}
}

% Software tools and word lists. These entries show their address through "note".
% A bare tilde in ordinary text is a non-breaking space in LaTeX, so tildes in
% these addresses are written as \textasciitilde{} and underscores as \_.
% Institutional authors are double-braced so BibTeX treats them as one name.

@misc{htmlparserSourceforge,
  author = {Oswald},
  title = {{HTMLParser}},
  version = {1.3},
  month = oct,
  year = {2003},
  note = {http://htmlparser.sourceforge.net}
}

@misc{SandnesSprell,
  author = {F. E. Sandnes},
  title = {sprell: Spell Checker and Dictionary},
  year = {2003},
  month = jan,
  version = {0.34},
  note = {http://www.iu.hio.no/\textasciitilde{}frodes/sprell/sprell.html}
}

@misc{porterstemmer,
  author = {M. Porter},
  title = {Porter Stemming Algorithm},
  note = {http://www.tartarus.org/\textasciitilde{}martin/PorterStemmer}
}

% NOTE(review): the address contained a space ("gl transition"); an underscore is assumed -- confirm.
@misc{transitionals,
  title = {Transitional Word List},
  author = {{Purdue University Online Writing Lab}},
  note = {http://owl.english.purdue.edu/handouts/general/gl\_transition.html}
}

@misc{pronounlist,
  title = {Proper Names List},
  author = {{U.S. Census Bureau, Population Division}},
  note = {http://www.census.gov/genealogy/names},
  month = may,
  year = {1995}
}

@misc{cmutoolkit,
  author = {P. Clarkson and R. Rosenfeld},
  title = {{CMU}-Cambridge Statistical Language Modeling toolkit},
  year = {2003},
  version = {2.05},
  note = {http://mi.eng.cam.ac.uk/\textasciitilde{}prc14/toolkit.html}
}

@misc{stopwordlist,
  author = {SourceForge},
  title = {Stop-Word List},
  note = {http://wordlist.sourceforge.net},
  month = oct,
  year = {2003}
}

@misc{imageAnalysis,
  author = {M. Schmidt},
  title = {Image Analysis},
  note = {http://www.geocities.com/marcoschmidt.geo/image-info.html}
}

% Comma form keeps the capitalised particle inside the surname.
@misc{osAntiword,
  author = {{Van Os}, A.},
  title = {Antiword: Converts {Microsoft Word} documents to ascii text},
  year = {1998--2003},
  version = {0.34},
  note = {http://www.winfield.demon.nl}
}

@unpublished{breure01,
  author = {L. Breure},
  title = {Development of the Genre Concept},
  address = {Information and Computing Sciences, University of Utrecht, The Netherlands},
  note = {http://www.cs.uu.nl/people/leen/GenreDev/GenreDevelopment.htm},
  month = aug,
  year = {2001}
}

@unpublished{McCallumLibbow,
  author = "A. K.
McCallum",
  title = {Bow: A toolkit for statistical language modeling, text retrieval, classification and clustering},
  note = {http://www.cs.cmu.edu/\textasciitilde{}mccallum/bow},
  year = {1996}
}

% OCR accuracy, ground-truth tooling and layout-based page classification.
% The "text" fields hold the raw citation string; standard styles ignore them.

@inproceedings{croft94evaluation,
  author = {W. B. Croft and S. M. Harding and K. Taghva and J. Borsack},
  title = {An Evaluation of Information Retrieval Accuracy with Simulated {OCR} Output},
  booktitle = {Symposium of Document Analysis and Information Retrieval, {ISRI}-{UNLV}},
  year = {1994},
  url = {citeseer.nj.nec.com/croft92evaluation.html}
}

@misc{kanungo01trueviz,
  author = {T. Kanungo and C. H. Lee and J. Czorapinski and I. Bella},
  title = {{TRUEVIZ}: a groundtruth/metadata editing and visualizing toolkit for {OCR}},
  text = {T. Kanungo, C. H. Lee, J. Czorapinski, and I. Bella. TRUEVIZ: a groundtruth/metadata editing and visualizing toolkit for OCR. In Proc. of SPIE Conference on Document Recognition and Retrieval, Jan. 2001.},
  month = jan,
  year = {2001},
  url = {citeseer.nj.nec.com/384635.html}
}

% The field name is "editor" (singular) even when there are several editors.
@inproceedings{shin00classification,
  author = {C. Shin and D. Doermann},
  title = {Classification of Document Page Images based on Visual Similarity on Layout Structures},
  text = {C.K. Shin and D.S. Doermann. Classification of document page images based on visual similarity on layout structures. In Proc. SPIE Vol. 3967, Document Recognition and Retrieval VII, Daniel P. Lopresti; Jiangying Zhou; Eds., pages 182--190, San Jose, California, 2000.},
  booktitle = {Proceedings SPIE Vol. 3967, Document Recognition and Retrieval VII},
  editor = {D. P. Lopresti and J. Zhou},
  pages = {182--190},
  address = {San Jose, California},
  year = {2000},
  url = {citeseer.nj.nec.com/shin00classification.html}
}

@inproceedings{rauber01integrating,
  author = "A. Rauber and A.
M{\"u}ller-Kogler",
  title = {Integrating Automatic Genre Analysis into Digital Libraries},
  booktitle = {Proceedings of the First {ACM}/{IEEE} Joint Conference on Digital Libraries},
  pages = {1--10},
  month = jun # "~24--28",
  year = {2001},
  address = {Roanoke, VA},
  url = {citeseer.ist.psu.edu/rauber01integrating.html}
}

% Genre as a lens for requirements, collaboration and document structure.

@misc{honkaranta-evaluating,
  author = {A. Honkaranta},
  title = {Evaluating the `Genre Lens' for Analyzing Requirements for Content Assembly},
  url = {citeseer.ist.psu.edu/597058.html}
}

@article{procter98genres,
  author = {R. Procter and A. Goldenberg and E. Davenport and A. McKinlay},
  title = {Genres in Support of Collaborative Information Retrieval in the Virtual Library},
  journal = {Interacting with Computers},
  volume = {10},
  number = {2},
  pages = {157--175},
  year = {1998},
  url = {citeseer.ist.psu.edu/procter97genres.html}
}

@inproceedings{wang98discovering,
  author = {K. Wang and H. Liu},
  title = {Discovering Typical Structures of Documents: {A} Road Map Approach},
  booktitle = {21st Annual International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  pages = {146--154},
  year = {1998},
  url = {citeseer.ist.psu.edu/wang98discovering.html}
}

@inproceedings{crowston04multi,
  author = {K. Crowston and B. Kwasnik},
  title = {A Framework for Creating a Facetted Classification for Genres: Addressing Issues of Multidimensionality},
  booktitle = {Proceedings of the 37th Annual Hawaii International Conference on System Sciences},
  publisher = {IEEE Computer Society},
  address = {Oahu, HI},
  isbn = {0-7695-2056-1},
  year = {2004},
  url = {http://crowston.syr.edu/papers/DDGDD01.pdf}
}

@inproceedings{dewe98corpus,
  author = {J. Dewe and J. Karlgren and I.
Bretan},
  title = {Assembling a balanced corpus from the internet},
  booktitle = {Proceedings of the 11th Nordic Conference on Computational Linguistics},
  address = {Copenhagen},
  year = {1998},
  url = {http://www.sics.se/humle/projects/DropJaw/korpus.html}
}

% Genre detection, cybergenres, hierarchical classification and semi-supervised learning.
% Non-ASCII characters are written as LaTeX escapes so classic BibTeX handles them.

@inproceedings{stamatatos00common,
  author = {E. Stamatatos and N. Fakotakis and G. Kokkinakis},
  title = {Text genre detection using common word frequencies},
  booktitle = {Proceedings of the 18th International Conference on Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  year = {2000},
  isbn = {1-555-55555-1},
  pages = {808--814},
  volume = {2},
  address = {Luxembourg}
}

% NOTE(review): this entry has no title; add one before citing it.
@misc{readability,
  author = {{wats.ca}},
  url = {http://wats.ca/resources/determiningreadability/1}
}

% NOTE(review): second surname assumed to be an accent-stripped "Akesson" -- confirm.
@inproceedings{ihlstrom04swedish,
  author = {C. Ihlstr{\"o}m and M. {\AA}kesson},
  title = {Genre Characteristics --- A Front Page Analysis of 85 {Swedish} Online Newspapers},
  booktitle = {Proceedings of the 37th Annual Hawaii International Conference on System Sciences},
  month = jan,
  address = {Oahu, HI},
  year = {2004}
}

@inproceedings{shepherd99cybergenres,
  author = {M. Shepherd and C. Watters},
  title = {The Functionality Attribute of Cybergenres},
  booktitle = {Proceedings of the 32nd Annual Hawaii International Conference on System Sciences},
  month = jan,
  year = {1999}
}

% NOTE(review): the title previously duplicated the preceding entry's title;
% replaced with the title matching this key, authors and venue -- confirm.
@inproceedings{dumais00hierarchical,
  author = {S. Dumais and H. Chen},
  title = {Hierarchical Classification of Web Content},
  booktitle = {Proceedings of Special Interest Group on Information Retrieval {(SIGIR)} '00},
  year = {2000}
}

@article{nigam00em,
  author = {K. Nigam and A. K. McCallum and S. Thrun and T. Mitchell},
  title = {Text Classification from Labeled and Unlabeled Documents using {EM}},
  journal = {Machine Learning},
  volume = {39},
  number = {2--3},
  year = {2000},
  issn = {0885-6125},
  pages = {103--134},
  publisher = {Kluwer Academic Publishers}
}

@inproceedings{liu04reading,
  author = {X. Liu and W. B. Croft and P. Oh and D.
Hart},
  title = {Automatic Recognition of Reading Levels from User Queries},
  booktitle = {Proceedings of Special Interest Group on Information Retrieval {(SIGIR)} '04},
  address = {Sheffield, England},
  month = jul,
  year = {2004}
}

% Text classification, attribute estimation, tagging and readability formulas.

@inproceedings{li98class,
  author = {Y. Li and A. K. Jain},
  title = {Classification of Text Documents},
  booktitle = {International Conference on Pattern Recognition},
  year = {1998},
  volume = {II}
}

@inproceedings{kononenko94estimating,
  author = {I. Kononenko},
  title = {Estimating Attributes: Analysis and Extensions of {RELIEF}},
  booktitle = {European Conference on Machine Learning},
  pages = {171--182},
  year = {1994},
  url = {citeseer.ist.psu.edu/kononenko94estimating.html}
}

@inproceedings{brill92simple,
  author = {Eric Brill},
  title = {A Simple Rule-Based Part-of-Speech Tagger},
  booktitle = {Proceedings of {ANLP}-92, 3rd Conference on Applied Natural Language Processing},
  address = {Trento, IT},
  pages = {152--155},
  year = {1992},
  url = {citeseer.ist.psu.edu/brill92simple.html}
}

@article{flesch48scale,
  author = {R. Flesch},
  title = {A new readability yardstick},
  journal = {Journal of Applied Psychology},
  volume = {32},
  year = {1948},
  pages = {221--233}
}

@article{coleman75liau,
  author = {M. Coleman and T. L. Liau},
  title = {A computer readability formula designed for machine scoring},
  journal = {Journal of Applied Psychology},
  volume = {60},
  year = {1975},
  pages = {283--284}
}

% The "Jr." suffix needs the "Last, Jr., First" form; written as "First Last Jr."
% BibTeX takes "Jr." as the surname. The report number and issuing body sit in
% the fields that report styles print.
@techreport{kincaid75,
  author = {J. P. Kincaid and Fishburne, Jr., R. P. and R. L. Rogers and B. S. Chissom},
  title = {Derivation of new readability formulas ({Automated Readability Index}, {Fog Count} and {Flesch Reading Ease Formula}) for {Navy} enlisted personnel},
  type = {Research Branch Report},
  number = {8-75},
  institution = {Naval Technical Training, U. S. Naval Air Station Memphis},
  address = {Millington, TN},
  year = {1975}
}

@book{boese05thesis,
  author = "E. S.
Boese", title = {Stereotyping the Web: Genre Classification of Web Documents}, year = "2005", publisher = "CSU", address = "Fort Collins, CO" }

@book{witten00weka,
  author    = {I. H. Witten and E. Frank},
  title     = {Data Mining: Practical Machine Learning Tools and Techniques with {Java} Implementations},
  publisher = {Morgan Kaufmann},
  address   = {San Francisco},
  year      = {2000}
}

% A numbered technical report: series, number and issuing lab are stored in
% techreport fields (book entries silently drop booktitle/version).
@techreport{cherry81style,
  author      = {L. L. Cherry and W. Vesterman},
  title       = {Writing Tools---The {STYLE} and {DICTION} Programs},
  type        = {Computer Science Technical Report},
  number      = {91},
  institution = {Bell Laboratories},
  address     = {Murray Hill, N.J.},
  year        = {1981},
  note        = {Republished as part of the 4.4BSD User's Supplementary Documents by O'Reilly}
}

@inproceedings{cohen96contextsensitive,
  author    = {W. W. Cohen and Y. Singer},
  title     = {Context-Sensitive Learning Methods for Text Categorization},
  booktitle = {Proceedings of {SIGIR}-96, 19th {ACM} International Conference on Research and Development in Information Retrieval},
  editor    = {H. Frei and D. Harman and P. Sch{\"{a}}uble and R. Wilkinson},
  publisher = {ACM Press, New York, US},
  address   = {Z{\"{u}}rich, CH},
  pages     = {307--315},
  year      = {1996},
  url       = {citeseer.ist.psu.edu/cohen96contextsensitive.html}
}

% The DOI is the stable identifier; it is the path component of the resolver URL.
@inproceedings{forman04pitfall,
  author    = {G. Forman},
  title     = {A Pitfall and Solution in Multi-Class Feature Selection for Text Classification},
  booktitle = {Proceedings of the Twenty-First International Conference on Machine Learning},
  publisher = {ACM Press},
  address   = {Banff, Alberta, Canada},
  year      = {2004},
  isbn      = {1-58113-828-5},
  doi       = {10.1145/1015330.1015356},
  url       = {http://doi.acm.org/10.1145/1015330.1015356}
}

% "school" is a required field for a PhD thesis entry.
@phdthesis{nigam01using,
  author  = {Kamal Nigam},
  title   = {Using Unlabeled Data to Improve Text Classification},
  school  = {Carnegie Mellon University},
  address = {Pittsburgh, US},
  year    = {2001},
  url     = {citeseer.ist.psu.edu/nigam01using.html}
}

% Workshop paper: the venue recorded in the free-form "text" field is also
% given as booktitle so that styles print it.
@inproceedings{mccallum98comparison,
  author    = {A. McCallum and K. Nigam},
  title     = {A Comparison of Event Models for {Naive Bayes} Text Classification},
  booktitle = {{AAAI}-98 Workshop on Learning for Text Categorization},
  text      = "A. McCallum and K. Nigam.
A comparison of event models for Naive Bayes text classification. In AAAI-98 Workshop on Learning for Text Categorization, 1998.", year = "1998", url = "citeseer.ist.psu.edu/mccallum98comparison.html" }

@misc{wilton01words,
  author = {D. Wilton},
  title  = {How Many Words Are There In The {English} Language?},
  month  = feb,
  year   = {2001},
  note   = {http://www.wordorigins.org/number.htm},
  url    = {http://www.wordorigins.org/number.htm}
}

@inproceedings{illouz00nlp,
  author    = {G. Illouz},
  title     = {Sublanguage Dependent Evaluation of Language Resources},
  booktitle = {International Conference on Language Resources and Evaluation},
  address   = {Athens, Greece},
  year      = {2000}
}

% The compound surname is braced: unbraced, BibTeX reads "zu" as a von-part
% and sorts/abbreviates the name under "Eissen".
@inproceedings{meyer04genreclass,
  author    = {{Meyer zu Eissen}, S. and B. Stein},
  title     = {Genre Classification of Web Pages},
  booktitle = {Proceedings of the 27th German Conference on Artificial Intelligence (KI-2004)},
  address   = {Ulm, Germany},
  month     = sep # " 20--24",
  year      = {2004},
  url       = {http://www-ai.upb.de/aisearch/ki04-frame.pdf}
}

@inproceedings{kira92relieff,
  author    = {K. Kira and L. Rendell},
  title     = {A Practical Approach to Feature Selection},
  booktitle = {Proceedings of the Ninth International Conference on Machine Learning},
  pages     = {249--256},
  year      = {1992}
}

@book{rijsbergen79fmeasure,
  author    = {C. J. van Rijsbergen},
  title     = {Information Retrieval},
  publisher = {Butterworths},
  address   = {London},
  year      = {1979}
}

@inproceedings{rauber-putting,
  author    = {Andreas Rauber and Oliver Witvoet and Andreas Aschenbrenner and Robert Bruckner},
  title     = {Putting the {World Wide Web} into a Data Warehouse: A {DWH}-based Approach to Web Analysis},
  booktitle = {Proceedings of the DEXA Workshop on Very Large Data Warehouses (VLDWH 2002)},
  publisher = {IEEE},
  address   = {Aix-en-Provence, France},
  month     = sep # " 2--6",
  year      = {2002},
  url       = {citeseer.ist.psu.edu/rauber02putting.html}
}

@inproceedings{ lodhi00text, author = "Huma Lodhi and John Shawe-Taylor and Nello Cristianini and Christopher J. C. H.
Watkins", title = "Text Classification using String Kernels", booktitle = "{NIPS}", pages = "563--569", year = "2000", url = "citeseer.ist.psu.edu/article/lodhi02text.html" }

@inproceedings{joachims01composite,
  author    = {T. Joachims and N. Cristianini and J. Shawe-Taylor},
  title     = {Composite Kernels for Hypertext Categorisation},
  booktitle = {Proceedings of {ICML}-01, 18th International Conference on Machine Learning},
  editor    = {C. Brodley and A. Danyluk},
  publisher = {Morgan Kaufmann Publishers},
  address   = {San Francisco},
  pages     = {250--257},
  year      = {2001},
  url       = {citeseer.ist.psu.edu/joachims01composite.html}
}

% NOTE(review): no year or venue is recorded for this entry -- add them if known.
@misc{rnkranz-hyperlink,
  author = {J. F{\"u}rnkranz},
  title  = {Hyperlink Ensembles: A Case Study in Hypertext Classification},
  url    = {citeseer.ist.psu.edu/578531.html}
}

% Report number and issuing institute are stored in their own fields rather
% than in a free-form note.
@techreport{rnkranz-using,
  author      = {Johannes F{\"u}rnkranz},
  title       = {Using Links for Classifying {Web}-pages},
  institution = {Austrian Research Institute for Artificial Intelligence},
  number      = {TR-OEFAI-98-29},
  year        = {1998},
  url         = {citeseer.ist.psu.edu/153148.html}
}

% Corporate author: the extra brace pair stops BibTeX from reading the comma
% as a "Last, First" separator.
% NOTE(review): this entry has no title -- add one if known.
@misc{apache04howto,
  author = {{Apache Forrest Group, Apache Software Foundation}},
  month  = dec # " 5",
  year   = {2004},
  url    = {http://forrest.apache.org/howto/howto-howto.html}
}