本文共 3169 字,大约阅读时间需要 10 分钟。
package com.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that tokenizes Chinese text with the MMSeg4j max-word segmenter
 * and then expands tokens with synonyms via {@link MySameworkFilter}.
 */
public class MySameworkAnalyzer extends Analyzer {

    /** Default location of the MMSeg4j dictionary data files (original hard-coded path). */
    private static final String DEFAULT_DIC_PATH = "F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data";

    /** Directory holding the MMSeg4j dictionary data files for this instance. */
    private final String dicPath;

    /** Creates an analyzer using the default dictionary location. */
    public MySameworkAnalyzer() {
        this(DEFAULT_DIC_PATH);
    }

    /**
     * Creates an analyzer that loads the MMSeg4j dictionary from the given
     * directory. Generalizes the previously hard-coded path.
     *
     * @param dicPath directory containing the MMSeg4j data files
     */
    public MySameworkAnalyzer(String dicPath) {
        this.dicPath = dicPath;
    }

    /**
     * Builds the token stream: MMSeg4j max-word tokenizer wrapped by the
     * synonym-injecting filter.
     *
     * @param fieldName the field being analyzed (unused by this analyzer)
     * @param reader    source of the text to tokenize
     * @return the filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Dictionary.getInstance caches per path, so repeated calls are cheap.
        Dictionary dic = Dictionary.getInstance(dicPath);
        return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
#####################
@Test
public void test05(){ try { Analyzer a1=new MySameworkAnalyzer(); String str="我来自中国,我的名字叫什么"; AnalyzerUtil.displayToken(str, a1); Directory directory=new RAMDirectory(); IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1)); Document document=new Document(); document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED)); indexWriter.addDocument(document); indexWriter.close(); IndexReader indexReader=IndexReader.open(directory); IndexSearcher searcher=new IndexSearcher(indexReader); TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10); ScoreDoc[] docs=tds.scoreDocs; Document doc=searcher.doc(docs[0].doc); System.out.println(doc.get("content")); searcher.close(); indexReader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }###############
package com.lucene.util;
import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Stack; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeSource; public class MySameworkFilter extends TokenFilter { //保存相应的词汇 private CharTermAttribute cta=null; //保存词与词之间的位置增量 private PositionIncrementAttribute pia=null; //定义一个状态 private AttributeSource.State current=null; //用栈保存同义词集合 private Stack<String> sames=null; protected MySameworkFilter(TokenStream input) { super(input); cta=this.addAttribute(CharTermAttribute.class); pia=this.addAttribute(PositionIncrementAttribute.class); sames=new Stack<String>(); } @Override public boolean incrementToken() throws IOException { if(sames.size()>0){ //将元素出栈,并获取同义词 String str=sames.pop(); //还原状态 restoreState(current); //先清空,再添加 cta.setEmpty(); cta.append(str); //设置位置为0,表示同义词 pia.setPositionIncrement(0); return true; } if(!this.input.incrementToken()) return false; //如果改词中有同义词,捕获当前状态 if(this.getSamewords(cta.toString())){ current=captureState(); } return true; } //定义同义词字典,并判断如果有同义词就返回true private boolean getSamewords(String key){ Map<String, String[]> maps=new HashMap<String, String[]>(); maps.put("我", new String[]{"咱","俺"}); maps.put("中国", new String[]{"大陆","天朝"}); if(maps.get(key)!=null){ for(String s:maps.get(key)){ sames.push(s); } } if(sames.size()>0){ return true; } return false; } }转载地址:http://rnebi.baihongyu.com/