您好,欢迎来到爱go旅游网。
搜索
您的当前位置:首页 > Lucene自定义分词器

Lucene自定义分词器

来源:爱go旅游网
package org.lucene.util;


import java.io.Reader;
import java.util.Set;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;


/**
 * 停用词分词器
 * @author 
 *
 */
public class MyStopAnalyzer extends Analyzer {
@SuppressWarnings("rawtypes")
private Set stops;
@SuppressWarnings("unchecked")
public MyStopAnalyzer(String[] sws) {
//会自动将字符串数组转换为Set
stops = StopFilter.makeStopSet(Version.LUCENE_4_9, sws, true);
//将原有的停用词加入到现有的停用词中
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

public MyStopAnalyzer() {
//获取原有的停用词
stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}


/**
public TokenStream tokenStream(String fieldName, Reader reader) {
//为这个分词器设置过滤链和Tokenizer
//使用的Tokenizer是 LetterTokenizer,先经过过滤器LowerCaseFilter(转换为小写的过滤器),再经过过滤器StopFilter(停用词过滤器)
return new StopFilter(Version.LUCENE_4_9,
  new LowerCaseFilter(Version.LUCENE_4_9, 
  new LetterTokenizer(Version.LUCENE_4_9,reader)), new CharArraySet(Version.LUCENE_4_9,stops,true));
}
    */
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
//创建Tokenizer
Tokenizer  tokenizer=new LetterTokenizer(Version.LUCENE_4_9,reader);
//创建过滤器链,先经过过滤器LowerCaseFilter(转换为小写的过滤器),再经过过滤器StopFilter(停用词过滤器)
TokenStream ts= new StopFilter(Version.LUCENE_4_9,
  new LowerCaseFilter(Version.LUCENE_4_9, tokenizer),new CharArraySet(Version.LUCENE_4_9,stops,true));
//创建TokenStreamComponents
TokenStreamComponents  tscs=new TokenStreamComponents(tokenizer,ts);
return tscs;
}



@Test
public void test04() {
Analyzer a1 = new MyStopAnalyzer(new String[]{"I","you","hate","how"});
Analyzer a2 = new MyStopAnalyzer();
String txt = "how are you thank you I hate you";
AnalyzerUtils.displayToken(txt, a1);
AnalyzerUtils.displayToken(txt, a2);
}


}

因篇幅问题不能全部显示,请点此查看更多更全内容

Copyright © 2019- igat.cn 版权所有 赣ICP备2024042791号-1

违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com

本站由北京市万商天勤律师事务所王兴未律师提供法律服务