Skip to content

Commit

Permalink
Support symlinking to a full installation of the Penn Treebank via th…
Browse files Browse the repository at this point in the history
…e ptb module (and companion modules propbank_ptb and nombank_ptb).
  • Loading branch information
nschneid committed Jun 18, 2012
1 parent c99b40f commit f730bc1
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 6 deletions.
16 changes: 13 additions & 3 deletions nltk/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,9 @@
cat_file='cats.txt', textid_file='textids.txt')
ppattach = LazyCorpusLoader(
    'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
# Penn Treebank v3: WSJ and Brown portions.
# The dot in front of the MRG extension is escaped so that it matches
# only a literal '.', not an arbitrary character.
ptb = LazyCorpusLoader(
    'ptb3', CategorizedBracketParseCorpusReader,
    r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d\.MRG',
    cat_file='allcats.txt', tag_mapping_function=simplify_wsj_tag)
qc = LazyCorpusLoader(
    'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'])
reuters = LazyCorpusLoader(
Expand Down Expand Up @@ -228,6 +228,16 @@
'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
treebank) # Must be defined *after* treebank corpus.
# Variants of the PropBank and NomBank loaders that are given the ptb
# corpus as their final argument.  The supplied function upper-cases
# each file name before it is used with that corpus.
# The frame-file patterns are raw strings so that '\.' reaches the
# regex engine without relying on an invalid string escape.
propbank_ptb = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: filename.upper(),
    ptb)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: filename.upper(),
    ptb)  # Must be defined *after* ptb corpus.

def demo():
# This is out-of-date:
Expand Down
1 change: 1 addition & 0 deletions nltk/corpus/reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
'MacMorphoCorpusReader', 'SyntaxCorpusReader',
'AlpinoCorpusReader', 'RTECorpusReader',
'StringCategoryCorpusReader','EuroparlCorpusReader',
'CategorizedBracketParseCorpusReader',
'CategorizedTaggedCorpusReader',
'CategorizedPlaintextCorpusReader',
'PortugueseCategorizedPlaintextCorpusReader',
Expand Down
57 changes: 57 additions & 0 deletions nltk/corpus/reader/bracket_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,63 @@ def _tag(self, t, simplify_tags=False):
def _word(self, t):
    # Normalize the input first, then collect every match that WORD
    # finds in the normalized form.
    normalized = self._normalize(t)
    return WORD.findall(normalized)

class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
                                          BracketParseCorpusReader):
    """
    A reader for parsed corpora whose documents are grouped into
    categories according to their file identifiers.

    Each data-access method accepts either C{fileids} or C{categories}
    (but not both).  A category selection is first turned into file
    identifiers, and the call is then forwarded to the method of the
    same name on L{BracketParseCorpusReader}.

    @author: Nathan Schneider <[email protected]>
    """
    def __init__(self, *args, **kwargs):
        """
        Set up the corpus reader.  The categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) go to the
        L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}; whatever remains goes to
        the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        # The keyword dictionary itself is handed over (not unpacked).
        # NOTE(review): presumably so that the categorized base can
        # consume its own keys before the rest are forwarded -- confirm
        # against CategorizedCorpusReader.__init__.
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) selection into file identifiers.

        @return: C{fileids} unchanged when no categories are given,
            otherwise the result of C{self.fileids(categories)}.
        @raise ValueError: If both C{fileids} and C{categories} are
            specified.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    # ----------------------------------------------------------------
    # Data-access methods.  Each one resolves the selection and then
    # delegates to the BracketParseCorpusReader method of the same name.
    # ----------------------------------------------------------------

    def raw(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.paras(self, selected)

    def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_words(
            self, selected, simplify_tags)

    def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_sents(
            self, selected, simplify_tags)

    def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_paras(
            self, selected, simplify_tags)

    def parsed_words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_words(self, selected)

    def parsed_sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_sents(self, selected)

    def parsed_paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_paras(self, selected)

class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
Expand Down
6 changes: 3 additions & 3 deletions nltk/corpus/reader/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,11 +756,11 @@ def find_corpus_fileids(root, regexp):
items = [name for name in fileids if re.match(regexp, name)]
return sorted(items)

# Find fileids in a directory: use os.walk to search all
# subdirectories, and match paths against the regexp.
# Find fileids in a directory: use os.walk to search all (proper
# or symlinked) subdirectories, and match paths against the regexp.
elif isinstance(root, FileSystemPathPointer):
items = []
for dirname, subdirs, fileids in os.walk(root.path):
for dirname, subdirs, fileids in os.walk(root.path, followlinks=True):
prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
items += [prefix+fileid for fileid in fileids
if re.match(regexp, prefix+fileid)]
Expand Down

0 comments on commit f730bc1

Please sign in to comment.