-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathhiero.conf
227 lines (192 loc) · 8.55 KB
/
hiero.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# this is an example Thrax configuration file
# <- this symbol indicates a comment
# each line should be a key-value pair separated by whitespace
#
# INPUT FILE OPTIONS
# ##################
# the unified corpus file.
# "unified" means it has this format on each line:
# source sentence ||| target sentence ||| alignments
#
# the sentences should be tokenized and normalized (and possibly parsed;
# see below). The alignments should be whitespace-separated int pairs, where
# the pair "i-j" means the ith source word is aligned to the jth target word
# (where the sentences are 0-indexed).
input-file example/europarl.unified.1
# if the source sentences of the corpus have been parsed, set this to true
source-is-parsed false
# if the target sentences of the corpus have been parsed, set this to true
target-is-parsed false
#
# GRAMMAR OPTIONS
# ###############
# The grammar type. Possible values:
# hiero -- a hierarchical phrase-based grammar with only one kind of
# non-terminal symbol (X)
# samt -- a hierarchical phrase-based grammar where the non-terminal symbols
# are derived from a syntax tree (this requires that either the
# source or target sentences in the corpus are parsed)
grammar hiero
# The minimum number of times we must extract a rule (counted over the entire
# corpus) before we keep it for the final grammar. Setting this value to 2
# removes a lot of bogus rules introduced by noise in the alignments.
min-rule-count 2
# If you're extracting an SAMT grammar, this key determines which sentence
# in each pair will be used to determine the syntactic labels. If true,
# target sentences will be used; if false, source sentences will be used.
# Note that the appropriate sentences have to be parsed in the input file.
#
# This key has no effect on Hiero grammars.
target-is-samt-syntax false
# What non-terminal symbol to use for Hiero grammars
# the default is X
default-nt X
# The goal symbol to be used in a glue grammar (this is only used if you
# run scripts/create_glue_grammar.sh).
goal-symbol GOAL
# This sets the maximum number of words that a non-terminal symbol can span
# in a rule. This default setting is due to Chiang.
initial-phrase-length 10
# This sets the maximum number of non-terminals allowed on the right hand
# side of any rule. For example, setting this to 0 will extract only simple
# phrase-based rules (non-hierarchical). Most decoders expect binary rules at
# most, so you shouldn't normally set this higher than 2, but Thrax can extract
# rules with arbitrary arity.
arity 2
# This sets the minimum number of aligned words that must be present in any
# extracted rule.
lexicality 1
# If true, allow non-terminals to be adjacent on the source side of extracted
# rules. If false, such rules are disallowed. For example,
#
# [NP] ||| le [NN,1] [JJ,2] ||| the [JJ,2] [NN,1]
#
# would not be extracted unless this key is set to true.
adjacent-nts false
# If this is set to true, allow unaligned words at the inside edges of
# non-terminals in extracted rules. As an example, consider the sentence pair
#
# the 15th of september ||| 15 septembre ||| 0-0 1-0 3-1
#
# If loose is set to false, we cannot extract the rule
#
# [X] ||| [X,1] september ||| [X,1] septembre
#
# Even though [X,1] is consistent with the alignment, the word "of" is
# unaligned, so it is not allowed as the edge word of the nonterminal.
# If this key is set to true, we may extract the above rule.
loose false
# If this is true, we are allowed to extract purely abstract rules (that is,
# rules that have no terminal symbols) like
#
# [A] ||| [B,1] [C,2] ||| [C,2] [B,1]
allow-abstract-rules false
# If we are extracting an SAMT grammar, there's a chance that no syntactic
# label may be assigned to the left hand side of a rule. If that is the case,
# we have two options:
#
# If this key is false, we throw the rule away (don't extract it).
# If this key is true, and the rule is not purely lexical (has some non-
# terminals), we assign X (the default NT) to the left hand side.
allow-nonlexical-x false
# This sets the maximum number of symbols (terminal plus non-terminal) allowed
# on the source side of any rule that is not purely lexical (that means any
# rule that has at least one nonterminal).
nonlex-source-length 5
# This sets the maximum number of symbols (terminal plus non-terminal) allowed
# on the target side of a not-purely-lexical rule.
nonlex-target-length 5
# This sets the maximum number of terminal symbols allowed on the source side
# of a not-purely-lexical rule.
nonlex-source-words 5
# This sets the maximum number of terminal symbols allowed on the target side
# of a not-purely-lexical rule (a rule with non-terminal symbols).
nonlex-target-words 5
# This sets the maximum span (in words) of the source side of any rule.
rule-span-limit 12
# If this key is set to true, we can ignore rule-span-limit (above) in the case
# where a rule spans the entire sentence.
allow-full-sentence-rules false
# With SAMT, we run into an ambiguity in non-terminal label assignment if
# the training data parse trees have unary rules. This key determines how to
# handle a unary rule. Possible options:
#
# top: use the top-most label
# so if we have a unary chain A -> B -> C in the parse tree which spans our
# non-terminal, assign the label A.
#
# bottom: use the bottom-most label
# in the above example, we would assign label C.
#
# all:
# create a new label by concatenating all labels in the chain.
# In the above example, we use the label A:B:C.
unary-category-handler bottom
# In SAMT grammars, allow assignment of constituent labels to non-terminals.
# This means, if a node of a parse tree spans the non-terminal of an extracted
# rule, we assign that node's label to the NT.
allow-constituent-label true
# In SAMT grammars, allow assignment of CCG-style labels to non-terminals.
# These are labels of the form A/B or A\B, indicating an A missing a B to its
# right or left, respectively.
allow-ccg-label true
# In SAMT grammars, allow assignment of concatenated labels to non-terminals.
# If two nodes A and B taken together span the non-terminal, we assign the
# label A+B.
allow-concat-label true
# In SAMT grammars, allow concatenation of three labels for non-terminals.
# This means assigning a label like A+B+C if, taken together, they span the
# non-terminal span of a rule.
allow-double-plus true
#
# RULE FEATURE OPTIONS
# ####################
# a whitespace-separated list of features. For a rule A -> (e,f) we define
# the following features:
# (note: all probabilities are negative logprobs base e)
#
# e2fphrase -- phrasal probability p(f|e)
# f2ephrase -- phrasal probability p(e|f)
# lexprob -- lexical weights p_lex(e|f) and p_lex(f|e)
# rarity -- rarity penalty exp(1 - C(A -> (e,f))), where C(.) is the total
# number of times the rule was seen in the corpus
# lexical -- 1 if the rule is purely lexical (no NTs), 0 otherwise
# abstract -- 1 if the rule is purely abstract (no terminals), 0 otherwise
# adjacent -- 1 if e (source) contains adjacent NTs, 0 otherwise
# x-rule -- 1 if the LHS nonterminal A is the default NT, 0 otherwise
# source-terminals-without-target -- 1 if e contains terminal symbols but f
# has none, 0 otherwise
# target-terminals-without-source -- 1 if f contains terminal symbols but e
# has none, 0 otherwise
# monotonic -- 1 if this rule has no re-ordering, 0 otherwise
# phrase-penalty -- a constant penalty feature
# glue-rule -- 1 if this rule is part of the glue grammar (never true for Thrax
# output), 0 otherwise
# target-word-count -- count of the number of terminals in f (target side)
# unaligned-count -- total # of unaligned words in this rule
features e2fphrase f2ephrase lexprob phrase-penalty
# The value of the constant phrase penalty feature.
phrase-penalty 2.718
#
# OUTPUT OPTIONS
# ##############
# Which decoder file format to use for the output. At the moment, there's
# only one option:
#
# joshua -- the format that is used for the Joshua decoder
output-format joshua
# If true, label each feature score in the output.
#
# Unlabeled output:
# [X] ||| the [X,1] ||| le [X,1] ||| 2.718 0.0 0.0
#
# Labeled output:
# [X] ||| the [X,1] ||| le [X,1] ||| PhrasePenalty=2.718 SourcePhraseGivenTarget=0.0 TargetPhraseGivenSource=0.0
label-feature-scores true
# If true, suppress the output of features whose scores are 0.
# If you set this to true, we recommend you set label-feature-scores to
# true also, so you know which features are present!
#
# Sparse, labeled output of rule above:
# [X] ||| the [X,1] ||| le [X,1] ||| PhrasePenalty=2.718
sparse-feature-vectors false