Package nltk_lite :: Package corpora :: Module cmudict
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.cmudict

  1  # Natural Language Toolkit: CMU Pronouncing Dictionary Corpus Reader
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] 
 10  ftp://ftp.cs.cmu.edu/project/speech/dict/ 
 11  Copyright 1998 Carnegie Mellon University 
 12   
 13  File Format: Each line consists of an uppercased word, a counter 
 14  (for alternative pronunciations), and a transcription.  Vowels are 
 15  marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.: 
 16  NATURAL 1 N AE1 CH ER0 AH0 L 
 17   
 18  The dictionary contains 127069 entries.  Of these, 119400 words are assigned 
 19  a unique pronunciation, 6830 words have two pronunciations, and 839 words have 
 20  three or more pronunciations.  Many of these are fast-speech variants. 
 21   
 22  Phonemes: There are 39 phonemes, as shown below: 
 23       
 24  Phoneme Example Translation    Phoneme Example Translation 
 25  ------- ------- -----------    ------- ------- ----------- 
 26  AA      odd     AA D           AE      at      AE T 
 27  AH      hut     HH AH T        AO      ought   AO T 
 28  AW      cow     K AW           AY      hide    HH AY D 
 29  B       be      B IY           CH      cheese  CH IY Z 
 30  D       dee     D IY           DH      thee    DH IY 
 31  EH      Ed      EH D           ER      hurt    HH ER T 
 32  EY      ate     EY T           F       fee     F IY 
 33  G       green   G R IY N       HH      he      HH IY 
 34  IH      it      IH T           IY      eat     IY T 
 35  JH      gee     JH IY          K       key     K IY 
 36  L       lee     L IY           M       me      M IY 
 37  N       knee    N IY           NG      ping    P IH NG 
 38  OW      oat     OW T           OY      toy     T OY 
 39  P       pee     P IY           R       read    R IY D 
 40  S       sea     S IY           SH      she     SH IY 
 41  T       tea     T IY           TH      theta   TH EY T AH 
 42  UH      hood    HH UH D        UW      two     T UW 
 43  V       vee     V IY           W       we      W IY 
 44  Y       yield   Y IY L D       Z       zee     Z IY 
 45  ZH      seizure S IY ZH ER 
 46  """ 
 47   
 48  from nltk_lite.corpora import get_basedir 
 49  import os 
 50   
 51  items = [ 
 52      'cmudict'] 
 53   
 54  item_name = { 
 55      'cmudict': 'CMU Pronunciation Dictionary, Version 0.6, 1998', 
 56  } 
 57   
def raw(files='cmudict'):
    """
    Read pronunciation entries from one or more cmudict files.

    Each non-blank line is split on whitespace into a word, an integer
    counter (for alternative pronunciations) and a transcription, e.g.
    C{('NATURAL', 1, ('N', 'AE1', 'CH', 'ER0', 'AH0', 'L'))}.

    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple} of (word, counter, transcription)
    @raise ValueError: if the second field of a line is not an integer
    @raise IndexError: if a non-blank line has fewer than two fields
    """

    # Just one file to process?  If so wrap it in a tuple so we can iterate.
    if isinstance(files, str):
        files = (files,)

    for filename in files:
        path = os.path.join(get_basedir(), "cmudict", filename)

        # Read the whole file and close the handle immediately, so it is
        # released even if the caller never exhausts this generator.
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        for line in lines:
            # Whitespace splitting avoids empty fields on repeated spaces.
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on a missing field.
                continue
            yield (fields[0], int(fields[1]), tuple(fields[2:]))
73
def dictionary(files='cmudict'):
    """
    Build a pronunciation dictionary from one or more cmudict files.

    @param files: One or more cmudict files, passed through to L{raw}
    @type files: L{string} or L{tuple(string)}
    @return: a mapping from each word to a tuple of its transcriptions,
        in the order the entries were read
    @rtype: L{dict}
    """
    pronunciations = {}
    for word, num, pron in raw(files):
        if num == 1:
            # Counter 1 starts a fresh entry for this word.
            pronunciations[word] = (pron,)
        else:
            # Append an alternative pronunciation.  Fall back to an empty
            # tuple so a variant whose counter-1 entry was never seen does
            # not raise KeyError.
            pronunciations[word] = pronunciations.get(word, ()) + (pron,)
    return pronunciations
82
def demo():
    """
    Print a sample of raw entries followed by a few dictionary lookups.
    """
    from nltk_lite.corpora import cmudict
    from itertools import islice

    # Every print below is a single-argument print(...) call, which
    # produces the same output whether print is a statement or a function.
    print("raw method:")
    for entry in islice(cmudict.raw(), 40000, 40025):
        print(entry)
    print("")

    print("dictionary method:")
    # Keep the mapping under its own name so the imported module is not
    # rebound.
    pronunciations = cmudict.dictionary()
    for word in ('NATURAL', 'LANGUAGE', 'TOOL', 'KIT'):
        print("%s %s" % (word, pronunciations[word]))
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()