Package nltk_lite :: Package corpora :: Module treebank
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.treebank

  1  # Natural Language Toolkit: Penn Treebank Reader 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from nltk_lite.corpora import get_basedir 
 10  from nltk_lite import tokenize, chunk 
 11  from nltk_lite.tag import tag2tuple 
 12  from nltk_lite.parse import tree 
 13  import os 
 14   
 15  """ 
 16  Penn Treebank corpus sample: tagged, NP-chunked, and parsed data from 
 17  Wall Street Journal for 3700 sentences. 
 18   
 19  This is a ~10% fragment of the Wall Street Journal section of the Penn 
 20  Treebank, (C) LDC 1995.  It is distributed with the Natural Language Toolkit 
 21  under the terms of the Creative Commons Attribution-NonCommercial-ShareAlike License 
 22  [http://creativecommons.org/licenses/by-nc-sa/2.5/]. 
 23   
 24  Raw: 
 25   
 26      Pierre Vinken, 61 years old, will join the board as a nonexecutive 
 27      director Nov. 29. 
 28   
 29  Tagged: 
 30   
 31      Pierre/NNP Vinken/NNP ,/, 61/CD years/NNS old/JJ ,/, will/MD join/VB  
 32      the/DT board/NN as/IN a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD ./. 
 33   
 34  NP-Chunked: 
 35   
 36      [ Pierre/NNP Vinken/NNP ] 
 37      ,/,  
 38      [ 61/CD years/NNS ] 
 39      old/JJ ,/, will/MD join/VB  
 40      [ the/DT board/NN ] 
 41      as/IN  
 42      [ a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD ] 
 43      ./. 
 44   
 45  Parsed: 
 46   
 47      ( (S  
 48        (NP-SBJ  
 49          (NP (NNP Pierre) (NNP Vinken) ) 
 50          (, ,)  
 51          (ADJP  
 52            (NP (CD 61) (NNS years) ) 
 53            (JJ old) ) 
 54          (, ,) ) 
 55        (VP (MD will)  
 56          (VP (VB join)  
 57            (NP (DT the) (NN board) ) 
 58            (PP-CLR (IN as)  
 59              (NP (DT a) (JJ nonexecutive) (NN director) )) 
 60            (NP-TMP (NNP Nov.) (CD 29) ))) 
 61        (. .) )) 
 62  """ 
 63   
def parsed(files = 'parsed', basedir = None):
    """
    Iterate over the parse trees stored in one or more treebank files.

    Each file is read in full, split into s-expressions with
    C{tokenize.sexpr}, and every s-expression is converted with
    C{tree.bracket_parse}.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} subdirectory;
        when empty or C{None} the result of C{get_basedir()} is used
    @type basedir: L{string}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Build the path from the resolved basedir so that a caller-supplied
        # directory is actually honoured.
        path = os.path.join(basedir, "treebank", filename)

        # Read everything up front and release the handle immediately;
        # try/finally keeps this valid on interpreters without `with`.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for sexpr in tokenize.sexpr(text):
            # Only the parse itself is guarded, so an IndexError raised by
            # the consumer at the yield point is not mistaken for a
            # formatting problem.
            try:
                parse = tree.bracket_parse(sexpr)
            except IndexError:
                # in case it's the real treebank format,
                # strip first and last brackets before parsing
                parse = tree.bracket_parse(sexpr[1:-1])
            yield parse
86
def chunked(files = 'chunked', basedir = None):
    """
    Iterate over the chunk trees stored in one or more treebank files.

    Each file is read in full, split on blank lines with
    C{tokenize.blankline}, and every resulting piece is converted with
    C{chunk.tagstr2tree}.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} subdirectory;
        when empty or C{None} the result of C{get_basedir()} is used
    @type basedir: L{string}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for filename in files:
        path = os.path.join(basedir, "treebank", filename)

        # Read everything up front and release the handle immediately;
        # try/finally keeps this valid on interpreters without `with`.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for sent in tokenize.blankline(text):
            yield chunk.tagstr2tree(sent)
104
def tagged(files = 'chunked', basedir = None):
    """
    Iterate over tagged sentences from one or more treebank files.

    Each file is read in full and split on blank lines with
    C{tokenize.blankline}.  Every piece is split on whitespace, the
    chunk bracket tokens C{[} and C{]} are dropped, and the remaining
    tokens are converted with C{tag2tuple}.  By default the C{chunked}
    file is read.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} subdirectory;
        when empty or C{None} the result of C{get_basedir()} is used
    @type basedir: L{string}
    @rtype: iterator over L{list(tuple)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Build the path from the resolved basedir so that a caller-supplied
        # directory is actually honoured.
        path = os.path.join(basedir, "treebank", filename)

        # Read everything up front and release the handle immediately;
        # try/finally keeps this valid on interpreters without `with`.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for sent in tokenize.blankline(text):
            # The bracket tokens only mark chunk boundaries; skip them.
            yield [tag2tuple(token)
                   for token in tokenize.whitespace(sent)
                   if token != '[' and token != ']']
126
def raw(files = 'raw', basedir = None):
    """
    Iterate over untagged sentences from one or more treebank files.

    Each file is read in full and split on blank lines with
    C{tokenize.blankline}; every piece is then split on whitespace with
    C{tokenize.whitespace} and yielded as a list of tokens.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} subdirectory;
        when empty or C{None} the result of C{get_basedir()} is used
    @type basedir: L{string}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Build the path from the resolved basedir so that a caller-supplied
        # directory is actually honoured.
        path = os.path.join(basedir, "treebank", filename)

        # Read everything up front and release the handle immediately;
        # try/finally keeps this valid on interpreters without `with`.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for sent in tokenize.blankline(text):
            yield list(tokenize.whitespace(sent))
147 148
149 -def demo():
150 from nltk_lite.corpora import treebank 151 from itertools import islice 152 153 print "Parsed:" 154 for tree in islice(treebank.parsed(), 3): 155 print tree.pp() 156 print 157 158 print "Chunked:" 159 for tree in islice(treebank.chunked(), 3): 160 print tree.pp() 161 print 162 163 print "Tagged:" 164 for sent in islice(treebank.tagged(), 3): 165 print sent 166 print 167 168 print "Raw:" 169 for sent in islice(treebank.raw(), 3): 170 print sent 171 print
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    demo()