// build.gradle

// Plugins applied to this project.
apply(plugin: 'java')
apply(plugin: 'maven')

// Coordinates of the produced artifact.
group = 'org.apdplat'
version = '1.1'

// Human-readable project description (kept in its original language).
description = 'word分词是一个Java实现的中文分词组件，提供了多种基于词典的分词算法，并利用ngram模型来消除歧义。能准确识别英文、数字，以及日期、时间等数量词，能识别人名、地名、组织机构名等未登录词。同时提供了Lucene、Solr、ElasticSearch插件。'

// Java language level for both sources and generated bytecode.
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8

// All dependencies are resolved from Maven Central.
repositories { mavenCentral() }

dependencies {
    // Libraries needed only to compile and run the tests.
    testCompile 'junit:junit:4.11'
    testCompile 'org.hamcrest:hamcrest-library:1.3'
    testCompile 'org.apache.lucene:lucene-test-framework:5.2.0'
    testCompile 'org.apache.lucene:lucene-queryparser:5.2.0'
    testCompile 'org.elasticsearch:elasticsearch:1.6.0:tests'
    testCompile 'com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.11'

    // Libraries needed to compile the main sources.
    compile 'org.slf4j:slf4j-api:1.6.4'
    compile 'org.apache.lucene:lucene-core:5.2.0'
    compile 'org.apache.lucene:lucene-analyzers-common:5.2.0'
    compile 'org.elasticsearch:elasticsearch:1.6.0'
    compile 'redis.clients:jedis:2.5.1'
    compile 'com.belerweb:pinyin4j:2.5.0'

    // Logback is only required at runtime; commons-logging is excluded
    // from whatever this dependency would otherwise pull in.
    runtime('ch.qos.logback:logback-classic:0.9.28') {
        exclude group: 'commons-logging', module: 'commons-logging'
    }
}

jar {
    // Leave the Corpus* classes, the corpus archive, paths named "corpus"
    // and the logback configuration out of the packaged jar.
    exclude(
        '**/org/apdplat/word/corpus/Corpus*',
        '**/corpus/corpora.zip',
        '**/corpus',
        '**/logback.xml'
    )
}

// Word segmentation demo.
task wordDemo(type: JavaExec) {
    description = 'Run org.apdplat.word.WordSegmenter'
    dependsOn(classes)

    // Main class to launch and the classpath it is launched with.
    main = 'org.apdplat.word.WordSegmenter'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM options for the launched process.
    jvmArgs('-client')
    maxHeapSize = '1200m'

    // Argument passed to WordSegmenter's main() method.
    args('demo')
}

// Step 1: extract words from the corpus, separated by spaces, and save the
// result to build/word.txt.
task extractText(type: JavaExec) {
    description = 'Run org.apdplat.word.corpus.ExtractText'
    dependsOn(classes)

    // Main class to launch and the classpath it is launched with.
    main = 'org.apdplat.word.corpus.ExtractText'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM options for the launched process.
    jvmArgs('-client')
    maxHeapSize = '12000m'

    // Argument passed to ExtractText's main() method: the output file.
    args('build/word.txt')
}

// Step 2: build word vectors from the segmented file build/word.txt; the
// vectors are saved to build/vector.txt and the vocabulary to
// build/vocabulary.txt.
task word2Vector(type: JavaExec) {
    description = 'Run org.apdplat.word.vector.Word2Vector'
    // Runs after extractText, which produces build/word.txt.
    dependsOn(extractText)

    // Main class to launch and the classpath it is launched with.
    main = 'org.apdplat.word.vector.Word2Vector'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM options for the launched process.
    jvmArgs('-client')
    maxHeapSize = '12000m'

    // Arguments passed to Word2Vector's main() method: input file, vector
    // file, vocabulary file, then two numeric settings.
    // NOTE(review): the meaning of '2' and '30' is not visible here — confirm
    // against Word2Vector.
    args(
        'build/word.txt',
        'build/vector.txt',
        'build/vocabulary.txt',
        '2',
        '30'
    )
}

// Step 3: compute the similarity between different word vectors; the console
// encoding is UTF-8.
task wordVectorDemo(type: JavaExec) {
    description = 'Run org.apdplat.word.vector.Distance'
    // Runs after word2Vector, which produces build/vector.txt.
    dependsOn(word2Vector)

    // Main class to launch and the classpath it is launched with.
    main = 'org.apdplat.word.vector.Distance'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM options for the launched process.
    jvmArgs('-client')
    maxHeapSize = '12000m'

    // Arguments passed to Distance's main() method: the vector file and the
    // console encoding.
    // NOTE(review): if Distance reads queries from the console, this task may
    // also need `standardInput = System.in` — confirm.
    args('build/vector.txt', 'utf-8')
}

// Accuracy and speed evaluation of the various segmentation algorithms.
task evaluation(type: JavaExec) {
    description = 'Run org.apdplat.word.corpus.Evaluation'
    dependsOn(classes)

    // Main class to launch and the classpath it is launched with.
    main = 'org.apdplat.word.corpus.Evaluation'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM options for the launched process.
    jvmArgs('-client')
    maxHeapSize = '1200m'
}