Skip to content

Commit

Permalink
升级lucene从5.2.0到5.2.1,升级elasticsearch从1.6.0到2.0.0-beta1
Browse files (browse the repository at this point in the history)
  • Loading branch information
ysc committed Aug 28, 2015
1 parent 98abc1b commit 06dba23
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 41 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@
<randomizedtesting.version>2.1.11</randomizedtesting.version>
<slf4j-api.version>1.6.4</slf4j-api.version>
<logback-classic.version>0.9.28</logback-classic.version>
<lucene.version>5.2.0</lucene.version>
<elasticsearch.version>1.6.0</elasticsearch.version>
<lucene.version>5.2.1</lucene.version>
<elasticsearch.version>2.0.0-beta1</elasticsearch.version>
<jedis.version>2.5.1</jedis.version>
<pinyin4j.version>2.5.0</pinyin4j.version>
</properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,24 @@
package org.apdplat.word.elasticsearch;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.*;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import java.io.Reader;
import java.util.Map;
import org.apdplat.word.lucene.ChineseWordAnalyzer;
import org.apdplat.word.lucene.ChineseWordTokenizer;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Map;

/**
* 中文分词索引分析组件
* @author 杨尚川
Expand Down Expand Up @@ -105,7 +108,7 @@ public String name() {
return "word";
}
@Override
public Tokenizer create(Reader reader) {
public Tokenizer create() {
return new ChineseWordTokenizer(tokenizerSegmentation);
}
}));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,25 @@

package org.apdplat.word.elasticsearch;

import org.elasticsearch.common.collect.ImmutableList;
import org.elasticsearch.common.component.LifecycleComponent;
import org.elasticsearch.common.inject.Module;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.plugins.AbstractPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;

/**
* 中文分词组件(word)的ElasticSearch插件
* @author 杨尚川
*/
public class ChineseWordPlugin extends AbstractPlugin {
public class ChineseWordPlugin extends Plugin {
private final Settings settings;
/**
 * Creates the plugin and keeps a reference to the supplied settings.
 *
 * @param settings settings handed to the plugin at construction time; stored in the
 *                 {@code settings} field without being copied or validated
 */
public ChineseWordPlugin(Settings settings) {
this.settings = settings;
}
@Override
public String name() {
return "word";
Expand All @@ -40,8 +48,17 @@ public String description() {
return "中文分词组件(word)";
}
@Override
public Collection<Class<? extends Module>> modules() {
return ImmutableList.<Class<? extends Module>>of(ChineseWordIndicesAnalysisModule.class);
public Collection<Module> nodeModules() {
return Collections.<Module>singletonList(new ChineseWordIndicesAnalysisModule());
}
/**
 * Reports the lifecycle service classes contributed by this plugin.
 * The plugin contributes none, so every call hands back a fresh, empty, mutable list.
 *
 * @return a newly created empty collection of service classes
 */
@Override
public Collection<Class<? extends LifecycleComponent>> nodeServices() {
    return new ArrayList<Class<? extends LifecycleComponent>>();
}
/**
 * Supplies the modules this plugin contributes for an index.
 *
 * @param indexSettings settings of the index; not consulted by this implementation
 * @return an immutable one-element list holding a new {@code ChineseWordIndicesAnalysisModule}
 */
@Override
public Collection<Module> indexModules(Settings indexSettings) {
    final Module analysisModule = new ChineseWordIndicesAnalysisModule();
    return Collections.singletonList(analysisModule);
}
public void onModule(AnalysisModule module) {
module.addProcessor(new ChineseWordAnalysisBinderProcessor());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@
package org.apdplat.word.elasticsearch;

import org.apache.lucene.analysis.Tokenizer;
import org.apdplat.word.lucene.ChineseWordTokenizer;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import java.io.Reader;
import org.apdplat.word.lucene.ChineseWordTokenizer;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.elasticsearch.index.settings.IndexSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -55,8 +54,9 @@ public ChineseWordTokenizerFactory(Index index, @IndexSettings Settings indexSet
segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
}
}

@Override
public Tokenizer create(Reader reader) {
public Tokenizer create() {
return new ChineseWordTokenizer(segmentation);
}
}
2 changes: 0 additions & 2 deletions src/main/resources/es-plugin.properties

This file was deleted.

9 changes: 9 additions & 0 deletions src/main/resources/plugin-descriptor.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Elasticsearch plugin descriptor for the "word" Chinese word segmentation plugin.

# Fully-qualified name of the plugin entry-point class.
classname=org.apdplat.word.elasticsearch.ChineseWordPlugin
# Plugin name; matches the value returned by ChineseWordPlugin.name().
name=word
# Short human-readable description of the plugin.
description=word
# Version of this plugin.
version=2.0.0-beta1
# Elasticsearch version this plugin is built for.
elasticsearch.version=2.0.0-beta1
# Whether the plugin is loaded with its own isolated classloader.
isolated=true
# The plugin ships no static site content, only JVM code.
site=false
jvm=true
# Java specification version the code is built against. It must be a
# dot-separated sequence of integers (the form of the java.specification.version
# system property), so a full build string such as "1.8.0_45" is not valid here.
java.version=1.8
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,32 @@

package org.apdplat.word.elasticsearch;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apdplat.word.lucene.ChineseWordAnalyzer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.junit.Test;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.TokenizerFactory;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

Expand All @@ -56,15 +54,18 @@
* @author 杨尚川
*/
public class ChineseWordIndicesAnalysisTest {
private static final Settings SETTINGS = ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
private static final Settings SETTINGS = Settings.settingsBuilder()
.put(IndexMetaData.SETTING_VERSION_CREATED, "2010099")
.put("path.home", "/Users/apple/elasticsearch-2.0.0-beta1")
.build();
@Test
public void testChineseWordIndicesAnalysis() throws IOException {
Index index = new Index("test");

Injector parentInjector = new ModulesBuilder()
.add(new SettingsModule(SETTINGS),
new EnvironmentModule(new Environment(SETTINGS)),
new IndicesAnalysisModule())
.add(new SettingsModule(SETTINGS),
new EnvironmentModule(new Environment(SETTINGS)),
new ChineseWordIndicesAnalysisModule())
.createInjector();

Injector injector = new ModulesBuilder().add(
Expand All @@ -79,8 +80,9 @@ public void testChineseWordIndicesAnalysis() throws IOException {
TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
assertTrue(match);

Tokenizer tokenizer = tokenizerFactory.create(new StringReader("他说的确实在理"));

Tokenizer tokenizer = tokenizerFactory.create();
tokenizer.setReader(new StringReader("他说的确实在理"));
String exp = "[确实, 在理]";
List<String> result = new ArrayList<>();
while(tokenizer.incrementToken()){
Expand All @@ -96,10 +98,12 @@ public void testChineseWordIndicesAnalysis() throws IOException {
TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
exp = "[杨尚川, apdplat, 应用级, 产品开发, 平台, 作者]";
result = new ArrayList<>();
tokenizer.reset();
while(tokenStream.incrementToken()){
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
result.add(charTermAttribute.toString());
}
tokenizer.close();
assertEquals(exp, result.toString());
}
}

0 comments on commit 06dba23

Please sign in to comment.