forked from ysc/word
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild.gradle
122 lines (109 loc) · 4.23 KB
/
build.gradle
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Apply the Java and Maven plugins.
apply plugin: 'java'
apply plugin: 'maven'

// Project coordinates and human-readable description.
version = '1.1'
group = 'org.apdplat'
description = 'word分词是一个Java实现的中文分词组件,提供了多种基于词典的分词算法,并利用ngram模型来消除歧义。能准确识别英文、数字,以及日期、时间等数量词,能识别人名、地名、组织机构名等未登录词。同时提供了Lucene、Solr、ElasticSearch插件。'

// Source and bytecode level: Java 8.
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
// All external dependencies are resolved from Maven Central.
repositories {
    mavenCentral()
}
dependencies {
    // Libraries used only when compiling and running the tests.
    testCompile 'junit:junit:4.11'
    testCompile 'org.hamcrest:hamcrest-library:1.3'
    testCompile 'org.apache.lucene:lucene-test-framework:5.2.0'
    testCompile 'org.apache.lucene:lucene-queryparser:5.2.0'
    testCompile 'org.elasticsearch:elasticsearch:1.6.0:tests'
    testCompile 'com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.11'

    // Libraries required to compile the main sources.
    // NOTE(review): Elasticsearch 1.6.0 is presumably built against an older
    // Lucene line than 5.2.0 -- confirm that mixing the two is intentional.
    compile 'org.slf4j:slf4j-api:1.6.4'
    compile 'org.apache.lucene:lucene-core:5.2.0'
    compile 'org.apache.lucene:lucene-analyzers-common:5.2.0'
    compile 'org.elasticsearch:elasticsearch:1.6.0'
    compile 'redis.clients:jedis:2.5.1'
    compile 'com.belerweb:pinyin4j:2.5.0'

    // Logging backend, needed at runtime only; commons-logging is excluded
    // from the dependencies it would otherwise bring in.
    runtime('ch.qos.logback:logback-classic:0.9.28') {
        exclude group: 'commons-logging', module: 'commons-logging'
    }
}
// Keep the corpus classes, the corpus data and the logback configuration
// out of the packaged jar.
jar {
    exclude(
        '**/org/apdplat/word/corpus/Corpus*',
        '**/corpus/corpora.zip',
        '**/corpus',
        '**/logback.xml'
    )
}
// Word segmentation demo.
task wordDemo(type: JavaExec,
              dependsOn: classes,
              description: 'Run org.apdplat.word.WordSegmenter') {
    // Entry point, launched on the main source set's runtime classpath.
    main = 'org.apdplat.word.WordSegmenter'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM settings for the forked process.
    maxHeapSize = '1200m'
    jvmArgs('-client')

    // Command-line argument handed to WordSegmenter.main().
    args('demo')
}
// Step 1: extract the words from the corpus, separated by spaces, and save
// the result to build/word.txt.
task extractText(type: JavaExec,
                 dependsOn: classes,
                 description: 'Run org.apdplat.word.corpus.ExtractText') {
    // Entry point, launched on the main source set's runtime classpath.
    main = 'org.apdplat.word.corpus.ExtractText'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM settings for the forked process.
    maxHeapSize = '12000m'
    jvmArgs('-client')

    // Output file handed to ExtractText.main().
    args('build/word.txt')
}
// Step 2: build word vectors from the segmented file build/word.txt, saving
// the vectors to build/vector.txt and the vocabulary to build/vocabulary.txt.
task word2Vector(type: JavaExec,
                 dependsOn: extractText,
                 description: 'Run org.apdplat.word.vector.Word2Vector') {
    // Entry point, launched on the main source set's runtime classpath.
    main = 'org.apdplat.word.vector.Word2Vector'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM settings for the forked process.
    maxHeapSize = '12000m'
    jvmArgs('-client')

    // Arguments handed to Word2Vector.main(): input file, vector file and
    // vocabulary file, followed by two numeric settings whose meaning is
    // defined by Word2Vector itself (not visible in this build script).
    args('build/word.txt', 'build/vector.txt', 'build/vocabulary.txt', '2', '30')
}
// Step 3: compute the similarity between word vectors; the console encoding
// is UTF-8.
task wordVectorDemo(type: JavaExec,
                    dependsOn: word2Vector,
                    description: 'Run org.apdplat.word.vector.Distance') {
    // Entry point, launched on the main source set's runtime classpath.
    main = 'org.apdplat.word.vector.Distance'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM settings for the forked process.
    maxHeapSize = '12000m'
    jvmArgs('-client')

    // Arguments handed to Distance.main(): vector file and console encoding.
    args('build/vector.txt', 'utf-8')
}
// Evaluate the accuracy and speed of the various segmentation algorithms.
task evaluation(type: JavaExec,
                dependsOn: classes,
                description: 'Run org.apdplat.word.corpus.Evaluation') {
    // Entry point, launched on the main source set's runtime classpath.
    main = 'org.apdplat.word.corpus.Evaluation'
    classpath(sourceSets.main.runtimeClasspath)

    // JVM settings for the forked process; no program arguments are passed.
    maxHeapSize = '1200m'
    jvmArgs('-client')
}