博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Lucene自定义同义词分词器
阅读量:4031 次
发布时间:2019-05-24

本文共 3169 字,大约阅读时间需要 10 分钟。

package com.lucene.util;
 
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 
 import com.chenlb.mmseg4j.Dictionary;
 import com.chenlb.mmseg4j.MaxWordSeg;
 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
 
 public class MySameworkAnalyzer extends Analyzer {
 
     @Override
     public TokenStream tokenStream(String str, Reader reader) {
         //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
         Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
         return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
     }
 

 }

#####################

@Test

     public void test05(){
         try {
             Analyzer a1=new MySameworkAnalyzer();
             String str="我来自中国,我的名字叫什么";
             AnalyzerUtil.displayToken(str, a1);
             Directory directory=new RAMDirectory();
             IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
             Document document=new Document();
             document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
             indexWriter.addDocument(document);
             indexWriter.close();
             IndexReader indexReader=IndexReader.open(directory);
             IndexSearcher searcher=new IndexSearcher(indexReader);
             TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
             ScoreDoc[] docs=tds.scoreDocs;
             Document doc=searcher.doc(docs[0].doc);
             System.out.println(doc.get("content"));
             searcher.close();
             indexReader.close();
         } catch (CorruptIndexException e) {
             e.printStackTrace();
         } catch (LockObtainFailedException e) {
             e.printStackTrace();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

###############

package com.lucene.util;

 
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Stack;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.AttributeSource;
 
 public class MySameworkFilter extends TokenFilter  {
 
     //保存相应的词汇
     private CharTermAttribute cta=null;
     //保存词与词之间的位置增量
     private PositionIncrementAttribute pia=null;
     //定义一个状态
     private AttributeSource.State current=null;
     //用栈保存同义词集合
     private Stack<String> sames=null;
     protected MySameworkFilter(TokenStream input) {
         super(input);
         cta=this.addAttribute(CharTermAttribute.class);
         pia=this.addAttribute(PositionIncrementAttribute.class);
         sames=new Stack<String>();
     }
 
 
     @Override
     public boolean incrementToken() throws IOException {
         if(sames.size()>0){
             //将元素出栈,并获取同义词
             String str=sames.pop();
             //还原状态
             restoreState(current);
             //先清空,再添加
             cta.setEmpty();
             cta.append(str);
             //设置位置为0,表示同义词
             pia.setPositionIncrement(0);
             return true;
         }
         
         if(!this.input.incrementToken())
         return false;
         
         //如果改词中有同义词,捕获当前状态
         if(this.getSamewords(cta.toString())){
             current=captureState();
         }
         
         return true;
     }
 
     //定义同义词字典,并判断如果有同义词就返回true
     private boolean getSamewords(String key){
         Map<String, String[]> maps=new HashMap<String, String[]>();
         maps.put("我", new String[]{"咱","俺"});
         maps.put("中国", new String[]{"大陆","天朝"});
         
         if(maps.get(key)!=null){
             for(String s:maps.get(key)){
                 sames.push(s);
             }
         }
         
         if(sames.size()>0){
             return true;
         }
         return false;
     }
 
 }

转载地址:http://rnebi.baihongyu.com/

你可能感兴趣的文章
opencv test code-1
查看>>
eclipse 导入先前存在的项目
查看>>
GNU hello代码分析
查看>>
Qt继电器控制板代码
查看>>
busybox passwd修改密码
查看>>
wpa_supplicant控制脚本
查看>>
rfkill: WLAN hard blocked
查看>>
gstreamer相关工具集合
查看>>
arm 自动升级脚本
查看>>
RS232 四入四出模块控制代码
查看>>
gstreamer插件之 videotestsrc
查看>>
autoupdate script
查看>>
linux 驱动开发 头文件
查看>>
/etc/resolv.conf
查看>>
container_of()传入结构体中的成员,返回该结构体的首地址
查看>>
linux sfdisk partition
查看>>
ipconfig,ifconfig,iwconfig
查看>>
opensuse12.2 PL2303 minicom
查看>>
电平触发方式和边沿触发的区别
查看>>
网络视频服务器移植
查看>>