% Tseng and Teahan (2004), conference paper in the ACM SIGIR 2004 proceedings.
% `address` holds the publisher's location (not the conference venue); the
% conference dates are kept in `note`.
@inproceedings{a47f7477fd8d457caba688af5c612089,
author = "Tseng, {Yuen Hsien} and Teahan, {William John}",
title = "Verifying a {Chinese} collection for text categorization",
booktitle = "Proceedings of {Sheffield} {SIGIR} - Twenty-Seventh Annual International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval",
publisher = "Association for Computing Machinery (ACM)",
address = "New York, NY, USA",
year = "2004",
pages = "556--557",
doi = "10.1145/1008992.1009118",
isbn = "1581138814",
language = "English",
keywords = "Chinese collection, Consistency verification, Duplicate detection",
abstract = "This article describes the development of a free test collection for Chinese text categorization. A novel retrieval-based approach was developed to detect duplicates and label inconsistency in this corpus and in Reuters-21578 for comparison. The method was able to detect certain types of similar and/or duplicated documents that were overlooked by an alternative repetition-based method. Experiments showed that effectiveness was not affected by the confusing documents.",
note = "Conference date: 25-07-2004 Through 29-07-2004",
}