JavaScript must be enabled in order for you to view this page. However, it seems JavaScript is either disabled or not supported by your browser. To view this page, enable JavaScript by changing your browser options, then try again.

 
我的论坛
Google 网上论坛 Beta 版
Do Best Developer
访问此论坛

将MMSeg和Lucene集成示例

今天 Jeff 给我发了一封 email,从中我发现了 MMSeg 的一个 bug。目前该 bug 已经修复,同时我也重新发布了 MMSeg,可以在这里找到。

Jeff还将MMSeg和Lucene集成到了一起,并将代码发给了我,还允许我放到这里供大家参考。感谢jeff的工作!

下面是具体的代码:

package net.jf.lucene.analysis.mmseg;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Lucene {@link Analyzer} whose token streams are produced by
 * {@link MMSegTokenizer}.
 */
public class MMSegAnalyzer extends Analyzer {
	/**
	 * Creates a token stream over the given reader.
	 *
	 * @param fieldName name of the field being analyzed; not used by this analyzer
	 * @param reader    source of the text to tokenize
	 * @return a new {@link MMSegTokenizer} wrapping {@code reader}
	 */
	public TokenStream tokenStream(String fieldName, Reader reader) {
		return new MMSegTokenizer(reader);
	}

	/**
	 * Small demo: tokenizes a fixed sample sentence and prints every token
	 * preceded by its running number.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String string = "到此为止这个简单的但是功能强大的分词器就写完了,下面咱们可以尝试写一个功能更强大的分词器.";
		// Alternative sample inputs:
		//String string = "我们的祖国是花园,花园的花朵真鲜艳。";
		//String string = "研究生命起源";
		Analyzer analyzer = new MMSegAnalyzer();
		TokenStream ts = analyzer
				.tokenStream("dummy", new StringReader(string));
		Token token;
		System.out.println("Tokens:");
		try {
			int n = 0;
			while ((token = ts.next()) != null)
			{
				System.out.println((n++) + "->" + token.toString());
			}
		}
		catch (IOException ioe)
		{
			ioe.printStackTrace();
		}
		finally
		{
			// Release the stream (and its underlying reader) even when
			// tokenization fails part-way through.
			try {
				ts.close();
			}
			catch (IOException ioe)
			{
				ioe.printStackTrace();
			}
		}
	}
}
package net.jf.lucene.analysis.mmseg;

import java.io.IOException;
import java.io.Reader;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.solol.mmseg.core.AlgorithmException;
import org.solol.mmseg.core.AlgorithmFactory;
import org.solol.mmseg.core.Config;
import org.solol.mmseg.core.ConfigurationException;
import org.solol.mmseg.core.IAlgorithm;
import org.solol.mmseg.core.IWord;

/**
 * Lucene {@link Tokenizer} that delegates word segmentation to an MMSeg
 * {@link IAlgorithm}.
 *
 * <p>The whole input is read into memory when the tokenizer is constructed;
 * each call to {@link #next()} then asks the algorithm for the next word.
 */
public class MMSegTokenizer extends Tokenizer{
	
	/** Algorithm type name handed to the factory; defaults to {@code Config.THREE_WORD}. */
	private String type = Config.THREE_WORD;
	
	/** Lucene token type assigned to every emitted token. */
	private String tokenType = "word";
	
	/** Factory used to create the algorithm; looked up lazily when not injected. */
	private AlgorithmFactory algorithmFactory;
	
	/** Segmentation algorithm created in the constructor. */
	private IAlgorithm algorithm;
	
	/** Running character offset of the next token; -1 until the first call to {@link #next()}. */
	private int index = -1;
	
	/** When true, {@link #debug(String)} prints to standard output. */
	private boolean debug = false;
	
	/** Complete input text, read once in {@link #setInput(Reader)}. */
	private char[] chs;
	
	/**
	 * Stores the algorithm type name.
	 *
	 * <p>NOTE(review): the algorithm is created in the constructor and this
	 * class does not re-create it here, so calling this setter does not change
	 * the algorithm already in use.
	 *
	 * @param type algorithm type name
	 */
	public void setType(String type) {
		this.type = type;
	}

	/**
	 * Stores the factory used by {@link #getAlgorithmFactory()}.
	 *
	 * <p>NOTE(review): the constructors have already obtained a factory and
	 * created the algorithm by the time this can be called, so the value is
	 * not used for the current algorithm.
	 *
	 * @param algorithmFactory factory to store
	 */
	public void setAlgorithmFactory(AlgorithmFactory algorithmFactory) {
		this.algorithmFactory = algorithmFactory;
	}

	/**
	 * Creates a tokenizer that uses the default algorithm type.
	 *
	 * @param input source of the text to tokenize; read completely here
	 * @throws RuntimeException if the input cannot be read or the algorithm
	 *         cannot be created
	 */
	public MMSegTokenizer(Reader input) {
		setInput(input);
		setAlgorithm(type);
	}
	
	/**
	 * Creates a tokenizer that uses the given algorithm type.
	 *
	 * @param input source of the text to tokenize; read completely here
	 * @param type  algorithm type name; {@code null} keeps the default type
	 * @throws RuntimeException if the input cannot be read or the algorithm
	 *         cannot be created
	 */
	public MMSegTokenizer(Reader input,String type) {
		setInput(input);
		// Only override the default when a type was actually supplied;
		// otherwise the null would replace the default and reach the factory.
		if (type != null) {
			this.type = type;
		}
		setAlgorithm(this.type);
	}
	
	/**
	 * Remembers the reader and reads its entire content into {@link #chs}.
	 *
	 * @throws RuntimeException wrapping the {@link IOException} if reading fails
	 */
	private void setInput(Reader input){
		this.input = input;
		try {
			chs = IOUtils.toCharArray(input);
		} catch (IOException e) {
			// Fail fast: continuing would leave chs null for every later call.
			throw new RuntimeException("Unable to read tokenizer input", e);
		}
	}
	
	/**
	 * Creates the segmentation algorithm for the given type, falling back to
	 * the configured type when {@code type} is {@code null}.
	 *
	 * @throws RuntimeException wrapping the {@link AlgorithmException} if the
	 *         factory cannot create the algorithm
	 */
	private void setAlgorithm(String type){
		type = type == null ? this.type : type;
		try {
			algorithm = getAlgorithmFactory().createAlgorithm(type);
		} catch (AlgorithmException e) {
			// Fail fast: a null algorithm would only surface later as a
			// NullPointerException in next().
			throw new RuntimeException("Unable to create MMSeg algorithm of type " + type, e);
		}
	}

	/**
	 * Returns the injected factory, or obtains one from
	 * {@code AlgorithmFactory.getFactory()} on first use.
	 *
	 * @throws RuntimeException wrapping the {@link ConfigurationException} if
	 *         the factory cannot be obtained
	 */
	private AlgorithmFactory getAlgorithmFactory() {
		if(null == algorithmFactory)
			try {
				algorithmFactory = AlgorithmFactory.getFactory();
			} catch (ConfigurationException e) {
				throw new RuntimeException(e);
			}
		return algorithmFactory;
	}


	/**
	 * Returns the next token, or {@code null} when the algorithm reports no
	 * further word.
	 *
	 * <p>Offsets are derived by summing the lengths of the words returned so
	 * far. NOTE(review): this assumes the algorithm returns words that are
	 * contiguous in the input, with no skipped characters — confirm.
	 *
	 * @return the next token, or {@code null} at end of input
	 * @throws IOException if the underlying algorithm fails; the
	 *         {@link AlgorithmException} is attached as the cause
	 */
	public Token next() throws IOException {
		if(index == -1){
			debug("start up");
			index = 0;
		}
		try {
			IWord word = algorithm.next(chs);
			if(null == word)
				return null;
			int start = index;
			index += word.getLength();
			return new Token(word.getValue(),start,index,tokenType);
		} catch (AlgorithmException e) {
			// Report the failure rather than returning null, which callers
			// would take for a normal end of stream.
			IOException ioe = new IOException("MMSeg segmentation failed at offset " + index);
			ioe.initCause(e);
			throw ioe;
		}
	}
	
	/** Prints {@code info} to standard output when the debug flag is set. */
	private void debug(String info){
		if(debug)
			System.out.println(info);
	}

}	

代码运行的结果是:

Tokens:
0->(到此为止,0,4)
1->(这个,4,6)
2->(简单,6,8)
3->(的,8,9)
4->(但是,9,11)
5->(功能,11,13)
6->(强大,13,15)
7->(的,15,16)
8->(分词,16,18)
9->(器,18,19)
10->(就,19,20)
11->(写完,20,22)
12->(了,22,23)
13->(,,23,24)
14->(下面,24,26)
15->(咱们,26,28)
16->(可以,28,30)
17->(尝试,30,32)
18->(写,32,33)
19->(一个,33,35)
20->(功能,35,37)
21->(更,37,38)
22->(强大,38,40)
23->(的,40,41)
24->(分词,41,43)
25->(器,43,44)
26->(.,44,45)