1
2
3
4
5
6
7
"""
Read from the Shakespeare XML Corpus Sample

http://www.andrew.cmu.edu/user/akj/shakespeare/

Marked up in XML by Jon Bosak, CSS stylesheet by Ajay Juneja.
"""
15
16 import os
17 from nltk_lite.corpora import get_basedir
18 from nltk_lite.etree import ElementTree
19
# Short names of the plays in the corpus sample; 'dream' is the one the
# demo code in this file loads via shakespeare.xml('dream').
items = [
    'a_and_c',
    'dream',
    'hamlet',
    'j_caesar',
    'macbeth',
    'merchant',
    'othello',
    'r_and_j',
]
29
39
def demo():
    """Print a short tour of ElementTree queries over the play 'dream'.

    Loads the play with ``shakespeare.xml('dream')`` and prints, in order:

    1. the child elements of the root;
    2. the text of the first child element;
    3. the text of every ``PERSONAE/PERSONA`` element;
    4. the speakers (``SPEAKER`` elements three levels below the root) whose
       text is not among the leading-capitals prefixes of the personae;
    5. a mapping from each speaker to the set of speakers who speak
       immediately after them within the same scene.

    Returns nothing; all results go to standard output.

    Each ``print(...)`` call takes exactly one parenthesised argument, which
    prints the same text as the plain Python 2 print statement and is also
    valid syntax on Python 3.
    """
    # Function-local imports: only needed when the demo is actually run.
    from nltk_lite.corpora import shakespeare
    from pprint import pprint
    import re

    play = shakespeare.xml('dream')

    print("Access the subelements")
    print(play.getchildren())
    print("")

    print("Access the text content of the first subelement")
    print(play[0].text)
    print("")

    print("Persona")
    personae = [persona.text for persona in play.findall('PERSONAE/PERSONA')]
    print(personae)
    print("")

    print("Are any speakers not identified as personae?")
    # '[A-Z]*' can match the empty string, so re.match() always returns a
    # match object for a string.  An element without text has text None,
    # which re.match() would reject with TypeError, so those are skipped.
    names = set(re.match(r'[A-Z]*', persona).group()
                for persona in personae if persona is not None)
    speakers = set(speaker.text for speaker in play.findall('*/*/*/SPEAKER'))
    print(speakers.difference(names))
    print("")

    print("who responds to whom?")
    responds_to = {}
    for scene in play.findall('ACT/SCENE'):
        # Reset per scene so the first speaker of a scene is not recorded
        # as responding to the last speaker of the previous one.
        prev = None
        for speaker in scene.findall('SPEECH/SPEAKER'):
            name = speaker.text
            if prev:
                responds_to.setdefault(prev, set()).add(name)
            prev = name
    pprint(responds_to)
    print("")
79
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    demo()
82