今天Jeff给我发了一封Email,从中我发现了MMSeg的一个bug。目前该bug已经修复,同时我也重新发布了MMSeg,可以在这里找到。
Jeff还将MMSeg和Lucene集成到了一起,并将代码发给了我,还允许我放到这里供大家参考。感谢Jeff的工作!
下面是具体的代码:
package net.jf.lucene.analysis.mmseg;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Lucene {@link Analyzer} whose token streams are produced by {@link MMSegTokenizer}.
 */
public class MMSegAnalyzer extends Analyzer {

    /**
     * Creates a token stream over the given reader.
     *
     * @param fieldName name of the field being analyzed; not used by this analyzer
     * @param reader    source of the text to tokenize
     * @return a new {@link MMSegTokenizer} wrapping {@code reader}
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new MMSegTokenizer(reader);
    }

    /**
     * Demo entry point: tokenizes a fixed sample sentence and prints every token,
     * prefixed with its zero-based position in the stream.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Alternative sample inputs:
        //   "我们的祖国是花园,花园的花朵真鲜艳。"
        //   "研究生命起源"
        String sample = "到此为止这个简单的但是功能强大的分词器就写完了,下面咱们可以尝试写一个功能更强大的分词器.";

        Analyzer mmseg = new MMSegAnalyzer();
        StringReader source = new StringReader(sample);
        TokenStream stream = mmseg.tokenStream("dummy", source);

        System.out.println("Tokens:");
        try {
            int position = 0;
            // next() returning null marks the end of the stream.
            for (Token current = stream.next(); current != null; current = stream.next()) {
                System.out.println(position + "->" + current.toString());
                position++;
            }
        } catch (IOException failure) {
            failure.printStackTrace();
        }
    }
}
package net.jf.lucene.analysis.mmseg;
import java.io.IOException;
import java.io.Reader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.solol.mmseg.core.AlgorithmException;
import org.solol.mmseg.core.AlgorithmFactory;
import org.solol.mmseg.core.Config;
import org.solol.mmseg.core.ConfigurationException;
import org.solol.mmseg.core.IAlgorithm;
import org.solol.mmseg.core.IWord;
/**
 * Lucene {@link Tokenizer} that delegates word segmentation to an MMSeg
 * {@link IAlgorithm}.
 *
 * <p>The whole input is read into memory when the tokenizer is constructed, and the
 * segmentation algorithm is created at the same time. Failures while reading the input
 * or creating the algorithm are reported immediately as a {@link RuntimeException}
 * (with the original exception as cause) instead of leaving the tokenizer half
 * initialized.
 */
public class MMSegTokenizer extends Tokenizer {

    /** Algorithm type passed to the factory; defaults to {@code Config.THREE_WORD}. */
    private String type = Config.THREE_WORD;

    /** Token type assigned to every emitted {@link Token}. */
    private String tokenType = "word";

    /** Factory used to create the algorithm; resolved lazily when still {@code null}. */
    private AlgorithmFactory algorithmFactory;

    /** Segmentation algorithm created by the constructor. */
    private IAlgorithm algorithm;

    /** Start offset of the next token; {@code -1} until {@link #next()} is first called. */
    private int index = -1;

    /** Enables the diagnostic output written by {@link #debug(String)}. */
    private boolean debug = false;

    /** Complete content of the input reader. */
    private char[] chs;

    /**
     * Stores the algorithm type.
     *
     * <p>NOTE(review): this only updates the field; the algorithm created by the
     * constructor is not rebuilt, so the call does not change how this instance
     * segments its input.
     *
     * @param type algorithm type name
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * Stores the factory used to create algorithms.
     *
     * <p>NOTE(review): the constructors have already created the algorithm by the time
     * this can be called, so the new factory is not used for the current algorithm.
     *
     * @param algorithmFactory factory to store
     */
    public void setAlgorithmFactory(AlgorithmFactory algorithmFactory) {
        this.algorithmFactory = algorithmFactory;
    }

    /**
     * Creates a tokenizer over {@code input} using the default algorithm type.
     *
     * @param input reader supplying the text to segment; it is read completely here
     * @throws RuntimeException if the input cannot be read or the algorithm cannot be
     *                          created
     */
    public MMSegTokenizer(Reader input) {
        setInput(input);
        setAlgorithm(type);
    }

    /**
     * Creates a tokenizer over {@code input} using the given algorithm type.
     *
     * @param input reader supplying the text to segment; it is read completely here
     * @param type  algorithm type name
     * @throws RuntimeException if the input cannot be read or the algorithm cannot be
     *                          created
     */
    public MMSegTokenizer(Reader input, String type) {
        setInput(input);
        this.type = type;
        setAlgorithm(type);
    }

    /** Remembers the reader and loads its entire content into {@link #chs}. */
    private void setInput(Reader input) {
        this.input = input;
        try {
            chs = IOUtils.toCharArray(input);
        } catch (IOException e) {
            // Previously swallowed, which left chs null for the later algorithm call.
            throw new RuntimeException("Failed to read tokenizer input", e);
        }
    }

    /** Creates the segmentation algorithm; a {@code null} type falls back to the field. */
    private void setAlgorithm(String type) {
        type = type == null ? this.type : type;
        try {
            algorithm = getAlgorithmFactory().createAlgorithm(type);
        } catch (AlgorithmException e) {
            // Previously swallowed, which left algorithm null and broke next() with an NPE.
            throw new RuntimeException("Failed to create MMSeg algorithm of type " + type, e);
        }
    }

    /** Returns the configured factory, obtaining the default one on first use. */
    private AlgorithmFactory getAlgorithmFactory() {
        if (null == algorithmFactory) {
            try {
                algorithmFactory = AlgorithmFactory.getFactory();
            } catch (ConfigurationException e) {
                throw new RuntimeException(e);
            }
        }
        return algorithmFactory;
    }

    /**
     * Returns the next segmented word as a token.
     *
     * <p>Offsets are accumulated from the lengths of the words returned so far.
     * NOTE(review): this assumes the algorithm returns consecutive words that cover
     * the input without gaps - confirm against the MMSeg implementation.
     *
     * @return the next token, or {@code null} when the algorithm has no more words
     * @throws IOException if the segmentation algorithm fails; the
     *                     {@link AlgorithmException} is attached as the cause
     */
    public Token next() throws IOException {
        if (index == -1) {
            debug("start up");
            index = 0;
        }
        try {
            IWord word = algorithm.next(chs);
            if (null == word) {
                return null;
            }
            Token token = new Token(word.getValue(), index, index + word.getLength(), tokenType);
            index += word.getLength();
            return token;
        } catch (AlgorithmException e) {
            // Returning null here would be indistinguishable from a normal end of stream
            // and silently truncate the token sequence, so report the failure instead.
            // initCause is used rather than the (String, Throwable) constructor to stay
            // compatible with older Java runtimes.
            IOException failure = new IOException("MMSeg segmentation failed at offset " + index);
            failure.initCause(e);
            throw failure;
        }
    }

    /** Prints {@code info} to standard output when debugging is enabled. */
    private void debug(String info) {
        if (debug) {
            System.out.println(info);
        }
    }
}
代码运行的结果是:
Tokens: 0->(到此为止,0,4) 1->(这个,4,6) 2->(简单,6,8) 3->(的,8,9) 4->(但是,9,11) 5->(功能,11,13) 6->(强大,13,15) 7->(的,15,16) 8->(分词,16,18) 9->(器,18,19) 10->(就,19,20) 11->(写完,20,22) 12->(了,22,23) 13->(,,23,24) 14->(下面,24,26) 15->(咱们,26,28) 16->(可以,28,30) 17->(尝试,30,32) 18->(写,32,33) 19->(一个,33,35) 20->(功能,35,37) 21->(更,37,38) 22->(强大,38,40) 23->(的,40,41) 24->(分词,41,43) 25->(器,43,44) 26->(.,44,45)