Support symlinking to a full installation of the Penn Treebank via the ptb module (and companion modules propbank_ptb and nombank_ptb).
akaptur · Jun 18, 2012 · f730bc1 · f730bc1
1 parent c99b40f
commit f730bc1
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 6 deletions.
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
@@ -148,9 +148,9 @@
     cat_file='cats.txt', textid_file='textids.txt')
 ppattach = LazyCorpusLoader(
     'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
-# ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
-#    'ptb3', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
-#    cat_file='allcats.txt', tag_mapping_function=simplify_wsj_tag)
+ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions (*.MRG parse files)
+    'ptb3', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d\.MRG',
+    cat_file='allcats.txt', tag_mapping_function=simplify_wsj_tag) # '\.' keeps the extension dot literal
 qc = LazyCorpusLoader(
     'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'])
 reuters = LazyCorpusLoader(
@@ -228,6 +228,16 @@
     'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
     lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
     treebank) # Must be defined *after* treebank corpus.
+propbank_ptb = LazyCorpusLoader( # PropBank loader that is handed the ptb corpus
+    'propbank', PropbankCorpusReader,
+    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
+    lambda filename: filename.upper(), # ptb's fileid pattern is upper-case
+    ptb) # Must be defined *after* ptb corpus.
+nombank_ptb = LazyCorpusLoader( # NomBank loader that is handed the ptb corpus
+    'nombank.1.0', NombankCorpusReader,
+    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
+    lambda filename: filename.upper(), # ptb's fileid pattern is upper-case
+    ptb) # Must be defined *after* ptb corpus.
 
 def demo():
     # This is out-of-date:

diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
@@ -106,6 +106,7 @@
     'MacMorphoCorpusReader', 'SyntaxCorpusReader',
     'AlpinoCorpusReader', 'RTECorpusReader',
     'StringCategoryCorpusReader','EuroparlCorpusReader',
+    'CategorizedBracketParseCorpusReader',
     'CategorizedTaggedCorpusReader',
     'CategorizedPlaintextCorpusReader',
     'PortugueseCategorizedPlaintextCorpusReader',

diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py
@@ -103,6 +103,63 @@ def _tag(self, t, simplify_tags=False):
     def _word(self, t):
         return WORD.findall(self._normalize(t))
 
+class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
+                                          BracketParseCorpusReader):
+    """
+    Bracket-parse corpus reader whose documents are also grouped into
+    categories; each accessor accepts either fileids or categories.
+    @author: Nathan Schneider <[email protected]>
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize both base readers.  C{kwargs} is handed to the
+        L{CategorizedCorpusReader constructor
+        <CategorizedCorpusReader.__init__>} as one dict (not unpacked),
+        which takes C{cat_pattern}, C{cat_map} and C{cat_file}; the
+        rest go to the L{BracketParseCorpusReader constructor
+        <BracketParseCorpusReader.__init__>}.
+        """
+        CategorizedCorpusReader.__init__(self, kwargs)
+        BracketParseCorpusReader.__init__(self, *args, **kwargs)
+
+    def _resolve(self, fileids, categories):
+        # Turn a (fileids, categories) request into plain fileids.
+        if categories is None:
+            return fileids
+        if fileids is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        return self.fileids(categories)
+    def raw(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.raw
+        return base(self, self._resolve(fileids, categories))
+    def words(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.words
+        return base(self, self._resolve(fileids, categories))
+    def sents(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.sents
+        return base(self, self._resolve(fileids, categories))
+    def paras(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.paras
+        return base(self, self._resolve(fileids, categories))
+    def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
+        base = BracketParseCorpusReader.tagged_words
+        return base(self, self._resolve(fileids, categories), simplify_tags)
+    def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
+        base = BracketParseCorpusReader.tagged_sents
+        return base(self, self._resolve(fileids, categories), simplify_tags)
+    def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
+        base = BracketParseCorpusReader.tagged_paras
+        return base(self, self._resolve(fileids, categories), simplify_tags)
+    def parsed_words(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.parsed_words
+        return base(self, self._resolve(fileids, categories))
+    def parsed_sents(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.parsed_sents
+        return base(self, self._resolve(fileids, categories))
+    def parsed_paras(self, fileids=None, categories=None):
+        base = BracketParseCorpusReader.parsed_paras
+        return base(self, self._resolve(fileids, categories))
+
 class AlpinoCorpusReader(BracketParseCorpusReader):
     """
     Reader for the Alpino Dutch Treebank.

diff --git a/nltk/corpus/reader/util.py b/nltk/corpus/reader/util.py
@@ -756,11 +756,11 @@ def find_corpus_fileids(root, regexp):
         items = [name for name in fileids if re.match(regexp, name)]
         return sorted(items)
 
-    # Find fileids in a directory: use os.walk to search all
-    # subdirectories, and match paths against the regexp.
+    # Find fileids in a directory: use os.walk to search all (proper
+    # or symlinked) subdirectories, and match paths against the regexp.
     elif isinstance(root, FileSystemPathPointer):
         items = []
-        for dirname, subdirs, fileids in os.walk(root.path):
+        for dirname, subdirs, fileids in os.walk(root.path, followlinks=True):
             prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
             items += [prefix+fileid for fileid in fileids
                       if re.match(regexp, prefix+fileid)]