java中文分词代码.docx
文本预览下载声明
/*?* created by yzh 2004.5.12?* 请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制,欢迎大家采用本人所写JS动态拖动表格实现代码。?* 中文分词代码?*此代码为作者多年经验总结,以前发表过VB,PB版本*/import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.util.Locale;import java.util.TreeMap;import java.util.TreeSet;public class ChineseSegmenter { private static ChineseSegmenter segmenter = null; // private Hashtable zhwords; private TreeMap zhwords; private TreeSet cforeign, cnumbers; // Char form public final static int TRAD = 0; public final static int SIMP = 1; public final static int BOTH = 2; // Charform is TRAD, SIMP or BOTH private ChineseSegmenter(int charform, boolean loadwordfile) { cforeign = new TreeSet(); cnumbers = new TreeSet(); if (charform == SIMP) { loadset(cnumbers, data/snumbers_u8.txt); loadset(cforeign, data/sforeign_u8.txt); } else if (charform == TRAD) { loadset(cnumbers, data/tnumbers_u8.txt); loadset(cforeign, data/tforeign_u8.txt); } else { // BOTH loadset(cnumbers, data/snumbers_u8.txt); loadset(cforeign, data/sforeign_u8.txt); loadset(cnumbers, data/tnumbers_u8.txt); loadset(cforeign, data/tforeign_u8.txt); } // zhwords = new Hashtable(120000); zhwords = new TreeMap(); if (!loadwordfile) { return; } String newword = null; try { InputStream worddata = null; if (charform == SIMP) { worddata = getClass().getResourceAsStream(simplexu8.txt); } else if (charform == TRAD) { worddata = getClass().getResourceAsStream(tradlexu8.txt); } else if (charform == BOTH) { worddata = getClass().getResourceAsStream(bothlexu8.txt); } BufferedReader in = new BufferedReader(new InputStreamReader( worddata, UTF8)); while ((newword = in.readLine()) != null) { if ((newword.indexOf(#) == -1) (newword.length() 5)) { zhwords.put(newword.intern(), 1); if (newword.length() == 3) { if (zhwords.containsKey(newword.substring(0, 2) .intern()) == false) { zhwords.put(newword.substring(0, 2).intern(), 2); } } if (newword.length() == 4) { if (zhwords.containsKey(newword.sub
显示全部