# example/hiero.conf

# this is an example Thrax configuration file
# <- this symbol indicates a comment
# each line should be a key-value pair separated by whitespace

# 
# INPUT FILE OPTIONS
# ##################

# the unified corpus file.
# "unified" means it has this format on each line:
# source sentence ||| target sentence ||| alignments
#
# the sentences should be tokenized and normalized (and possibly parsed;
# see below). The alignments should be whitespace-separated int pairs, where
# the pair "i-j" means the ith source word is aligned to the jth target word
# (where the sentences are 0-indexed).
input-file  example/europarl.unified.1

# if the source sentences of the corpus have been parsed, set this to true
source-is-parsed    false
# if the target sentences of the corpus have been parsed, set this to true
target-is-parsed    false

#
# GRAMMAR OPTIONS
# ###############

# The grammar type. Possible values:
# hiero -- a hierarchical phrase-based grammar with only one kind of 
#          non-terminal symbol (X)
# samt -- a hierarchical phrase-based grammar where the non-terminal symbols
#         are derived from a syntax tree (this requires that either the
#         source or target sentences in the corpus are parsed)
grammar     hiero   

# The minimum number of times we must extract a rule (counted over the entire
# corpus) before we keep it for the final grammar. Setting this value to 2
# removes a lot of bogus rules introduced by noise in the alignments.
min-rule-count  2

# If you're extracting an SAMT grammar, this key determines which sentence
# in each pair will be used to determine the syntactic labels. If true,
# target sentences will be used; if false, source sentences will be used.
# Note that the appropriate sentences have to be parsed in the input file.
#
# This key has no effect on Hiero grammars.
target-is-samt-syntax   false

# What non-terminal symbol to use for Hiero grammars
# the default is X
default-nt    X   

# The goal symbol to be used in a glue grammar (this is only used if you
# run scripts/create_glue_grammar.sh)
goal-symbol GOAL

# This sets the maximum number of words that a non-terminal symbol can span
# in a rule. This default setting is due to Chiang.
initial-phrase-length   10  

# This sets the maximum number of non-terminals allowed on the right hand
# side of any rule. For example, setting this to 0 will extract only simple
# phrase-based rules (non-hierarchical). Most decoders expect binary rules at
# most, so you shouldn't normally set this higher than 2, but Thrax can extract
# rules with arbitrary arity.
arity                   2   

# This sets the minimum number of aligned words that must be present in any
# extracted rule.
lexicality              1

# If true, allow non-terminals to be adjacent on the source side of extracted
# rules. If false, such rules are disallowed. For example,
#
# [NP] ||| le [NN,1] [JJ,2] ||| the [JJ,2] [NN,1]
#
# would not be extracted unless this key is set to true.
adjacent-nts    false   

# If this is set to true, allow unaligned words at the inside edges of
# non-terminals in extracted rules. As an example, consider the sentence pair
#
# the 15th of september ||| 15 septembre ||| 0-0 1-0 3-1
#
# If loose is set to false, we cannot extract the rule
#
# [X] ||| [X,1] september ||| [X,1] septembre
#
# Even though [X,1] is consistent with the alignment, the word "of" is 
# unaligned, so it is not allowed as the edge word of the nonterminal.
# If this key is set to true, we may extract the above rule.
loose           false   

# If this is true, we are allowed to extract purely abstract rules (that is,
# rules that have no terminal symbols) like
#
# [A] ||| [B,1] [C,2] ||| [C,2] [B,1] 
allow-abstract-rules    false

# If we are extracting an SAMT grammar, there's a chance that no syntactic
# label may be assigned to the left hand side of a rule. If that is the case,
# we have two options:
# 
# If this key is false, we throw the rule away (don't extract it).
# If this key is true, and the rule is not purely lexical (has some non-
# terminals), we assign X (the default NT) to the left hand side.
allow-nonlexical-x      false

# This sets the maximum number of symbols (terminal plus non-terminal) allowed
# on the source side of any rule that is not purely lexical (that means any
# rule that has at least one nonterminal).
nonlex-source-length    5

# This sets the maximum number of symbols (terminal plus non-terminal) allowed
# on the target side of a not-purely-lexical rule.
nonlex-target-length    5

# This sets the maximum number of terminal symbols allowed on the source side
# of a not-purely-lexical rule.
nonlex-source-words     5

# This sets the maximum number of terminal symbols allowed on the target side
# of a not-purely-lexical rule (a rule with non-terminal symbols).
nonlex-target-words     5

# This sets the maximum span (in words) of the source side of any rule.
rule-span-limit         12

# If this key is set to true, we can ignore rule-span-limit (above) in the case
# where a rule spans the entire sentence.
allow-full-sentence-rules   false

# With SAMT, we run into an ambiguity in non-terminal label assignment if
# the training data parse trees have unary rules. This key determines how to
# handle a unary rule. Possible options:
#
# top: use the top-most label
# so if we have a unary chain A -> B -> C in the parse tree which spans our
# non-terminal, assign the label A.
#
# bottom: use the bottom-most label
# in the above example, we would assign label C.
#
# all:
# create a new label by concatenating all labels in the chain.
# In the above example, we use the label A:B:C.
unary-category-handler  bottom

# In SAMT grammars, allow assignment of constituent labels to non-terminals.
# This means, if a node of a parse tree spans the non-terminal of an extracted
# rule, we assign that node's label to the NT.
allow-constituent-label true

# In SAMT grammars, allow assignment of CCG-style labels to non-terminals.
# These are labels of the form A/B or A\B, indicating an A missing a B to its
# right or left, respectively.
allow-ccg-label true

# In SAMT grammars, allow assignment of concatenated labels to non-terminals.
# If two nodes A and B taken together span the non-terminal, we assign the
# label A+B.
allow-concat-label  true

# In SAMT grammars, allow concatenation of three labels for non-terminals.
# This means assigning a label like A+B+C if, taken together, they span the
# non-terminal span of a rule.
allow-double-plus   true

#
# RULE FEATURE OPTIONS
# ####################

# a whitespace-separated list of features. For a rule A -> (e,f) we define
# the following features:
# (note: all probabilities are negative logprobs base e)
#
# e2fphrase -- phrasal probability p(f|e)
# f2ephrase -- phrasal probability p(e|f)
# lexprob -- lexical weights p_lex(e|f) and p_lex(f|e)
# rarity -- rarity penalty exp(1 - C(A -> (e,f))), where C(.) is the total 
# number of times the rule was seen in the corpus
# lexical -- 1 if the rule is purely lexical (no NTs), 0 otherwise
# abstract -- 1 if the rule is purely abstract (no terminals), 0 otherwise
# adjacent -- 1 if e (source) contains adjacent NTs, 0 otherwise
# x-rule -- 1 if the LHS nonterminal A is the default NT, 0 otherwise
# source-terminals-without-target -- 1 if e contains terminal symbols but f
# has none, 0 otherwise
# target-terminals-without-source -- 1 if f contains terminal symbols but e
# has none, 0 otherwise
# monotonic -- 1 if this rule has no re-ordering, 0 otherwise
# phrase-penalty -- a constant penalty feature
# glue-rule -- 1 if this rule is part of the glue grammar (never true for Thrax
# output), 0 otherwise
# target-word-count -- count of the number of terminals in f (target side)
# unaligned-count -- total # of unaligned words in this rule
features        e2fphrase f2ephrase lexprob phrase-penalty

# The value of the constant phrase penalty feature.
phrase-penalty  2.718

#
# OUTPUT OPTIONS
# ##############

# Which decoder file format to use for the output. At the moment, there's
# only one option:
#
# joshua -- the format that is used for the Joshua decoder
output-format   joshua  

# If true, label each feature score in the output.
#
# Unlabeled output:
# [X] ||| the [X,1] ||| le [X,1] ||| 2.718 0.0 0.0
#
# Labeled output:
# [X] ||| the [X,1] ||| le [X,1] ||| PhrasePenalty=2.718 SourcePhraseGivenTarget=0.0 TargetPhraseGivenSource=0.0 
label-feature-scores true  

# If true, suppress the output of features whose scores are 0.
# If you set this to true, we recommend you set label-feature-scores to
# true also, so you know which features are present!
#
# Sparse, labeled output of rule above:
# [X] ||| the [X,1] ||| le [X,1] ||| PhrasePenalty=2.718 
sparse-feature-vectors  false