Package nltk_lite :: Package corpora :: Module gutenberg
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.gutenberg

 1  # Natural Language Toolkit: Gutenberg Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read tokens from the NLTK Gutenberg Corpus. 
11   
12  Project Gutenberg  --  http://gutenberg.net/ 
13   
14  This corpus contains selected texts from Project Gutenberg: 
15   
16  * Jane Austen (3) 
17  * William Blake (2) 
18  * G. K. Chesterton (3) 
19  * King James Bible 
20  * John Milton 
21  * William Shakespeare (3) 
22  * Walt Whitman 
23  """        
24   
25  from nltk_lite.corpora import get_basedir 
26  from nltk_lite import tokenize 
27  import os, re 
28   
# Names of the corpus items that can be read; this list is the default
# selection used by raw() when no explicit file names are given.
items = [
    'austen-emma',
    'austen-persuasion',
    'austen-sense',
    'bible-kjv',
    'blake-poems',
    'blake-songs',
    'chesterton-ball',
    'chesterton-brown',
    'chesterton-thursday',
    'milton-paradise',
    'shakespeare-caesar',
    'shakespeare-hamlet',
    'shakespeare-macbeth',
    'whitman-leaves',
]
45   
# Human-readable "Author: Title" label for each corpus item, keyed by the
# item's name as listed in `items`.
item_name = {
    'austen-emma':         'Jane Austen: Emma',
    'austen-persuasion':   'Jane Austen: Persuasion',
    'austen-sense':        'Jane Austen: Sense and Sensibility',
    'bible-kjv':           'King James Bible',
    'blake-poems':         'William Blake: Poems',
    # Author name corrected from the misspelling "Willian".
    'blake-songs':         'William Blake: Songs of Innocence and Experience',
    'chesterton-ball':     'G.K. Chesterton: The Ball and The Cross',
    'chesterton-brown':    'G.K. Chesterton: The Wisdom of Father Brown',
    'chesterton-thursday': 'G.K. Chesterton: The Man Who Was Thursday',
    'milton-paradise':     'John Milton: Paradise Lost',
    'shakespeare-caesar':  'William Shakespeare: Julius Caesar',
    'shakespeare-hamlet':  'William Shakespeare: Hamlet',
    'shakespeare-macbeth': 'William Shakespeare: Macbeth',
    'whitman-leaves':      'Walt Whitman: Leaves of Grass',
}
62   
63   
def raw(files=items):
    """
    Generate the tokens of one or more Gutenberg corpus texts.

    `files` is either a single item name (a string such as 'bible-kjv')
    or a sequence of item names; it defaults to every name in `items`.
    Each name is resolved to "<basedir>/gutenberg/<name>.txt", where
    <basedir> is the value returned by get_basedir().

    Lines are skipped until a line starting with '*END*' has been seen;
    that marker line itself is skipped too.  Every later line is passed to
    tokenize.wordpunct() and the resulting tokens are yielded one at a
    time.  A text that contains no '*END*' line therefore yields nothing.

    Errors raised by open() for a missing or unreadable file propagate to
    the caller when the generator reaches that file.
    """
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "gutenberg", name + ".txt")
        f = open(path)
        # try/finally guarantees the handle is released even when the
        # consumer stops iterating early and the generator is closed.
        try:
            preamble = True
            # Iterate the file lazily instead of loading every line up front.
            for line in f:
                if not preamble:
                    for t in tokenize.wordpunct(line):
                        yield t
                # Checked after tokenizing so the marker line is not emitted.
                if line.startswith('*END*'):
                    preamble = False
        finally:
            f.close()
77
78 -def demo():
79 from nltk_lite.corpora import gutenberg 80 from itertools import islice 81 82 for word in islice(gutenberg.raw('bible-kjv'), 0, 100): 83 print word,
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()