Package nltk_lite :: Package contrib :: Module paradigm
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.paradigm

  1  # Natural Language Toolkit: Paradigm Visualisation 
  2  # 
  3  # Copyright (C) 2005 University of Melbourne 
  4  # Author: Will Hardy 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  # Front end to a Python implementation of David 
  9  # Penton's paradigm visualisation model. 
 10  # Author:  
 11  # 
 12  # Run: To run, first load a paradigm using 
 13  #      >>> a = Paradigm('paradigm.xml') 
 14  #      And run the system to produce output 
 15  #      >>> a.show('table(one, two, three)') 
 16  # 
 17  #      Other methods: 
 18  #      demo()                   # a quick demonstration 
 19  #      a.setFormat('html')      # output is formatted as HTML 
 20  #      a.setFormat('text')      # output is formatted as plain text 
 21  #      a.setOutput('filename')  # output is sent to filename 
 22  #      a.setOutput('term')      # output is sent to terminal 
 23   
 24  from xml.dom.ext.reader import Sax2 
 25  from paradigmquery import ParadigmQuery 
 26  import re, os 
 27   
class Paradigm(object):
    """
    Paradigm visualisation class

    *Usage*

    Simple usage of the system would be:
        >>> from paradigm import Paradigm
        >>> p = Paradigm('german.xml')
        >>> p.show('table(case, gender/number, content)')

    Here, a table is generated in HTML format and sent to the file ``output.html``.
    The table can be viewed in a browser, and is updated for every new query.

    A more advanced usage of the system is shown below.
    The user simply creates a paradigm p, changes the output format and location,
    and calls a dedicated prompt to enter the query:
        >>> from paradigm import Paradigm
        >>> p = Paradigm('german.xml')
        >>> p.setFormat('html')
        >>> p.setOutput('test.html')
        >>> p.setCSS('simple.css')
        >>> p.prompt()
        > table(case, gender/number, content)

    Please note, however, that plain text tables have not yet been implemented.
    """

    def __init__(self, p_filename):
        """
        Load the given paradigm
        p_filename is a string representing the filename of a paradigm xml file
        """
        # Default values: HTML output, written to a file, with a stylesheet
        self.format = "html"
        self.output = "output.html"
        self.css = "simple.css"
        # Make sure these exist even when the paradigm file cannot be found
        self.attributes = {}
        self.data = []
        self.loadParadigm(p_filename)

    def prompt(self):
        """
        Changes to a dedicated prompt
        Type 'exit' or 'quit' to exit (end-of-file also exits)
        """
        while True:
            try:
                query = raw_input(">")
            except EOFError:
                # End of input is treated as if the user had typed 'exit'
                print("exit")
                return
            if query in ("exit", "quit"):
                return
            # Drop trailing punctuation; a query made only of '!' and '.'
            # becomes empty and is skipped instead of raising IndexError
            query = query.rstrip("!.")
            if query:
                self.show(query)

    def show(self, p_string):
        """
        Process and display the given query

        p_string is the query string, e.g. 'table(case, gender, content)'.
        Problems are reported on the terminal; nothing is raised for a query
        that cannot be parsed or rendered.
        """
        try:
            # parse the query
            parse = ParadigmQuery(p_string)
        except Exception:
            print("Could not parse query.")
            return

        try:
            # Fetch the parsed tree and make presentation
            result = Sentence(self, parse.getTree())
        except Exception:
            print("Sorry, no result can be returned")
            return

        try:
            # HTML output if format is set, otherwise plain text
            if self.format == "html":
                output = self._buildHTML(result)
            else:
                output = result.getText()
        except Exception:
            print("--no output--")
            return

        # A renderer may return None when it has no rendering for this format
        if output is None:
            print("--no output--")
            return

        # Print to terminal if output is set, otherwise to file
        if self.output == "term":
            print(output)
        else:
            print("Output written to file: %s" % (self.output,))
            f = open(self.output, 'w')
            try:
                f.write(output)
            finally:
                # Always release the file handle, even if the write fails
                f.close()

    def _buildHTML(self, p_result):
        """
        Wrap the table rows produced by p_result.getHTML() in a full HTML page
        """
        parts = ['<html>\n']
        # Include CSS if we need to
        if self.css is not None:
            parts.append('<link rel="stylesheet" href="')
            parts.append(self.css)
            parts.append('" type="text/css" media="screen" />\n')
        parts.append('<body>')
        parts.append("<table cellspacing=\"0\" cellpadding=\"0\">")
        parts.append(p_result.getHTML())
        parts.append("</table>\n")
        parts.append('</body></html>\n')
        return "".join(parts)

    def setFormat(self, p_string=None):
        """
        Set the output format: "html" or "text"
        Anything else (including no argument) falls back to "text"
        """
        # Default value
        if p_string is None:
            p_string = "text"
        if p_string in ("html", "text"):
            self.format = p_string
        else:
            print("Unknown format: %s" % (p_string,))
            print("Valid formats are: text, html")
            print("Setting format = text")
            self.format = "text"

    def setCSS(self, p_string=None):
        """
        Set the file location for a Cascading Stylesheet: None or filename
        This allows for simple formatting
        """
        if p_string is not None:
            print("Using CSS file: %s" % (p_string,))
        # Store in self.css (the attribute show() reads), not self.output
        self.css = p_string

    def setOutput(self, p_string=None):
        """
        Set the output location: "term" or filename
        """
        # Default
        if p_string is None:
            p_string = "term"
        if p_string == "term":
            print("Directing output to terminal")
        else:
            print("Directing output to file: %s" % (p_string,))
        self.output = p_string

    def loadParadigm(self, p_filename):
        """
        Load the given paradigm (XML file)
        The file is looked up in the "paradigms" directory under the
        directory returned by nltk_lite.corpora.get_basedir().

        Attributes are stored in self.attributes
        Data are stored in self.data

        They can be accessed as follows:
        self.attributes['gender']   # list of genders
        self.data[6]['gender']      # gender for the sixth data object
        self.data[6]['content']     # content for the sixth data object

        Returns None; prints "Cannot find file" and leaves the current
        attributes and data untouched if the file cannot be opened.
        """
        from nltk_lite.corpora import get_basedir

        # Look for the file
        try_filename = os.path.join(get_basedir(), "paradigms", p_filename)
        try:
            f = open(try_filename)
        except IOError:
            print("Cannot find file")
            return None
        f.close()
        p_filename = try_filename

        # These variables will be set by this method
        self.attributes = {}
        self.data = []

        # XML admin: create Reader object, parse document
        reader = Sax2.Reader()
        doc = reader.fromStream(p_filename)

        # for <name> in <attributes>: map the name to its list of <value>s
        attributes = doc.getElementsByTagName('attributes')[0]
        for name in attributes.getElementsByTagName('name'):
            self.attributes[name.getAttribute('name')] = [
                value.getAttribute('value')
                for value in name.getElementsByTagName('value')
            ]

        # for <form> in <paradigm>: one dictionary per form, built from
        # the name/value pairs of its <attribute> children
        forms = doc.getElementsByTagName('paradigm')[0]
        for form in forms.getElementsByTagName('form'):
            tmp_dict = {}
            for value in form.getElementsByTagName('attribute'):
                tmp_dict[value.getAttribute('name')] = value.getAttribute('value')
            self.data.append(tmp_dict)

        # Talk to the user
        print("Paradigm information successfully loaded from file: %s"
              % (p_filename,))
        # State the number and print out a list of attributes
        summary = [" " * 4 + str(len(self.attributes)) + " attributes imported:"]
        summary.extend(self.attributes)
        print(" ".join(summary))
        # State the number of paradigm objects imported
        print(" " * 4 + str(len(self.data)) + " paradigm objects imported.")

class Sentence(object):
    """
    Generic node of a parsed query

    Works out which kind of node the tree represents and hands every
    request over to the matching handler, which is kept in self.item
    """

    def __init__(self, p_paradigm, p_tree):
        """
        p_paradigm is the given paradigm (attributes and data)
        p_tree is the query tree
        """
        self.paradigm = p_paradigm
        self.tree = p_tree
        # One-letter type code of the root node of the tree
        self.type = self.getType(self.tree)
        # Pick the handler for this type code; for any other code no
        # handler is created and self.item stays unset
        if self.type == 'O':
            # An 'O' node is handled through its first child
            self.item = Sentence(self.paradigm, self.tree[0])
        elif self.type == 'D':
            self.item = Domain(self.paradigm, self.tree)
        elif self.type == 'H':
            self.item = Hierarchy(self.paradigm, self.tree)
        elif self.type == 'T':
            self.item = Table(self.paradigm, self.tree)

    def getList(self):
        """
        Values of the handler as a list, or None when there is no tree
        """
        if self.tree == None:
            return None
        handler = self.item
        return handler.getList()

    def getHTML(self):
        """
        Values of the handler in html (table) form
        """
        handler = self.item
        return handler.getHTML()

    def getHorizontalHTML(self, p_parentSpan=1):
        """
        Values of the handler in horizontal html (table) form
        """
        handler = self.item
        return handler.getHorizontalHTML(p_parentSpan)

    def getText(self):
        """
        Values of the handler in plain text form
        """
        handler = self.item
        return handler.getText()

    def getConditions(self):
        """
        List of conditions for each combination (cell), from the handler
        """
        handler = self.item
        return handler.getConditions()

    def getMaxWidth(self):
        """
        Width of the handler in number of characters
        """
        handler = self.item
        return handler.getMaxWidth()

    def getSpan(self):
        """
        Span of the handler (required for "rowspan" and "colspan" HTML attributes)
        """
        handler = self.item
        return handler.getSpan()

    def getDepth(self):
        """
        Depth of the handler
        """
        handler = self.item
        return handler.getDepth()

    def getType(self, p_tree=None):
        """
        Determine the type of the given node (default: this node's own tree)
        This need not be overridden
        """
        node = p_tree
        if node == None:
            node = self.tree
        # The type code is the second character of the string representation
        text = str(node)
        return text[1:2]

class Domain(Sentence):
    """
    Manages a domain operation: the list of values of a single attribute

    Provides: Domain(paradigm,tree)
    """

    def __init__(self, p_paradigm, p_tree):
        """
        p_paradigm is the given paradigm (attributes and data)
        p_tree is the query tree
        """
        self.paradigm = p_paradigm
        # Only domain nodes may be handled here
        assert self.getType(p_tree) == 'D'
        # The attribute name is the first child of the node
        self.attribute = p_tree[0]
        self.error = None
        # Tell the user when the paradigm does not know the attribute
        if self.attribute not in self.paradigm.attributes:
            self.error = "I couldn't find this attribute: " + self.attribute
            print(self.error)

    def __getitem__(self, p_index):
        values = self.paradigm.attributes[self.attribute]
        return values[p_index]

    def getList(self):
        """
        The values of this domain's attribute, as a list
        """
        known = self.paradigm.attributes
        return known[self.attribute]

    def getHTML(self):
        """
        One single-cell table row per value
        """
        rows = ["<tr><td>" + value + "</td></tr>" for value in self.getList()]
        return "".join(rows)

    def getHorizontalHTML(self, p_parentSpan=1):
        """
        A single table row with one cell per value,
        the run of cells being repeated p_parentSpan times
        """
        cells = "".join(["<td>" + value + "</td>" for value in self.getList()])
        return "<tr>" + cells * p_parentSpan + "</tr>"

    def getText(self):
        """
        One value per line
        """
        return "".join([value + "\n" for value in self.getList()])

    def getConditions(self):
        """
        One {attribute: value} condition per value (one per cell)
        """
        return [{self.attribute: value} for value in self.getList()]

    def getMaxWidth(self):
        """
        Length (chars) of the longest value, 0 if there are no values
        """
        return max([0] + [len(value) for value in self.getList()])

    def getSpan(self):
        """
        Number of values in this domain
        """
        values = self.getList()
        return len(values)

    def getDepth(self):
        """
        A domain is always exactly one level deep
        """
        return 1

class Hierarchy(Sentence):
    """
    Manages a hierarchy operation: a root domain combined with a leaf node

    Provides: Hierarchy(paradigm,tree)
    """

    def __init__(self, p_paradigm, p_tree):
        """
        p_paradigm is the given paradigm (attributes and data)
        p_tree is the tree representation of this part of the query (Tree)
        """
        self.paradigm = p_paradigm
        self.error = None
        self.tree = p_tree
        # The node must be a hierarchy whose first child is a domain
        assert self.getType(p_tree) == 'H'
        assert self.getType(p_tree[0]) == 'D'
        # First child is the root, second child is the leaf
        self.root = Domain(self.paradigm, p_tree[0])
        self.leaf = Sentence(self.paradigm, p_tree[1])

    def getList(self):
        """
        Every [root item, leaf item] pair, grouped by root item
        """
        roots = self.root.getList()
        leaves = self.leaf.getList()
        return [[root_item, leaf_item]
                for root_item in roots
                for leaf_item in leaves]

    def getHTML(self):
        """
        Table rows for this hierarchy: each root value spans the leaf rows
        """
        chunks = []
        for heading in self.root.getList():
            # Drop the leaf's leading "<tr>": its first row is merged into
            # the row opened for the root cell
            leaf_rows = self.leaf.getHTML()[4:]
            opening = "<tr><td rowspan=\"" + str(self.leaf.getSpan()) + "\">"
            chunks.append(opening + heading + "</td>" + leaf_rows)
        return "".join(chunks)

    def getHorizontalHTML(self, p_parentSpan=1):
        """
        A row of root cells, followed by the rows of the leaf
        """
        headings = self.root.getList()
        # One cell per root item, as wide as the leaf below it
        cells = []
        for heading in headings:
            cells.append("<td colspan=\"" + str(self.leaf.getSpan()) + "\">"
                         + heading + "</td>")
        top_row = "".join(cells)
        # The leaf repeats once for every root cell of every parent repeat
        below = self.leaf.getHorizontalHTML(p_parentSpan * len(headings))
        return "<tr>" + top_row * p_parentSpan + "</tr>" + below

    def getText(self):
        """
        Plain text for this hierarchy: root values with the leaf text
        indented to their right
        """
        root_width = self.root.getMaxWidth()
        leaf_width = self.leaf.getMaxWidth()
        # Continuation lines of the leaf text are pushed past the root column
        continuation = "\n" + " " * (root_width + 1)
        blocks = []
        for heading in self.root.getList():
            leaf_text = self.leaf.getText().ljust(leaf_width)
            leaf_text = leaf_text.replace('\n', continuation)
            blocks.append(heading.ljust(root_width) + " " + leaf_text + "\n")
        # Remove any lines that consist of spaces only
        return re.sub('\n[ ]+\n', '\n', "".join(blocks))

    def getConditions(self):
        """
        One condition dictionary per combination (cell): each leaf
        condition extended with the root attribute
        """
        combined = []
        for root_value in self.root.getList():
            # The leaf conditions are requested anew for every root value
            for leaf_cond in self.leaf.getConditions():
                leaf_cond.update({self.root.attribute: root_value})
                combined.append(leaf_cond)
        return combined

    def getMaxWidth(self):
        """
        Root width plus leaf width plus one separating character
        """
        widths = (self.root.getMaxWidth(), self.leaf.getMaxWidth())
        return widths[0] + widths[1] + 1

    def getDepth(self):
        """
        One level for the root, plus the depth of the leaf
        """
        return 1 + self.leaf.getDepth()

    def getSpan(self):
        """
        Span (for HTML tables): root span times leaf span
        """
        root_span = self.root.getSpan()
        return root_span * self.leaf.getSpan()

class Table(Sentence):
    """
    Manages a table operation

    The three children of the node are, in order, the horizontal header,
    the vertical header and the cell contents.

    Provides: Table(paradigm,tree)
    """

    def __init__(self, p_paradigm, p_tree):
        """
        p_paradigm is the given paradigm (attributes and data)
        p_tree is the tree representation of this part of the query (Tree)
        """
        self.paradigm = p_paradigm
        self.error = None

        self.tree = p_tree
        # Validate that this is a Table
        assert self.getType(p_tree) == 'T'
        # Set the table arguments
        self.horizontal = Sentence(self.paradigm, p_tree[0])
        self.vertical = Sentence(self.paradigm, p_tree[1])
        self.cells = Sentence(self.paradigm, p_tree[2])

    def getList(self):
        """
        Return the table (cells) in list form (always an empty list)
        """
        return []

    def getHTML(self):
        """
        Return the html table rows for this table operation

        The result starts with an empty corner cell, followed by the
        horizontal header and then one row per vertical condition.
        """
        # Start with the dead cell
        dead_cell = "<tr><td colspan=\"" + str(self.vertical.getDepth()) \
                  + "\" rowspan=\"" + str(self.horizontal.getDepth()) \
                  + "\"></td>"
        # Insert horizontal header (its leading "<tr>" is dropped because the
        # dead cell already opened the row)
        horizontal_header = self._asHeader(self.horizontal.getHorizontalHTML()[4:])
        # Get the vertical header
        vertical_header = self._asHeader(self.vertical.getHTML())

        # Build the cells of each row. The rows are kept in a list rather than
        # split back out of one long string, because a cell may itself hold a
        # nested <table> containing its own <tr>...</tr> tags.
        cell_rows = []
        for cond_v in self.vertical.getConditions():
            row = ""
            # Fresh condition dictionaries for every row, so that merging
            # never carries state over from one row to the next
            for cond_h in self.horizontal.getConditions():
                # Get the data for this cell, given the hori and vert conditions
                cell_data = self.getData(self.cells.tree, dictJoin(cond_v, cond_h))
                row += "<td>" + cell_data + "</td>"
            cell_rows.append(row)

        # VERTICAL HEADER INCLUSION
        # Each header row is extended with the cells of the matching row
        vertical_header_rows = vertical_header.split('</tr>')
        str_zipped = ""
        for (header, cells) in zip(vertical_header_rows, cell_rows):
            if header != '':
                str_zipped += header + cells + "</tr>\n"

        # Return all the elements
        return dead_cell + horizontal_header + str_zipped

    def _asHeader(self, p_html):
        """
        Turn the <td> cells of p_html into <th> cells
        Only the tags are rewritten; "td" inside cell text is left alone
        """
        return p_html.replace('<td', '<th').replace('</td>', '</th>')

    def getHorizontalHTML(self, p_parentSpan=1):
        """
        Return a horizontal html table (?)
        Not supported for tables: prints a notice and returns None
        """
        print("?: getHorizontalHTML() called on a table.")
        return None

    def getText(self):
        """
        Return text for this table (?)
        Not supported for tables: prints a notice and returns None
        """
        print("?: getText() for a table? HAHAHAHAHA")
        print("call setFormat('html') if you want to run queries like that")
        return

    def getConditions(self):
        """
        Return conditions for this table (?)
        Not supported for tables: prints a notice and returns None
        """
        print("?: getConditions() called on a table. I don't think so.")
        return None

    def getMaxWidth(self):
        """
        Return the maximum width this table could take up.
        ... I hope you're not trying to nest tables ...
        """
        return self.cells.getMaxWidth() + self.vertical.getMaxWidth() + 1

    def getSpan(self):
        """
        Return span for this table (?)
        Not supported for tables: prints a notice and returns None
        """
        print("WTF: getSpan() called on a table.")
        return None

    def getData(self, p_return, p_attDict):
        """
        Retrieve data that matches the given list of attributes
        Returns (an HTML) string of values that match: the bare value when
        exactly one data object matches, otherwise a nested <table> with one
        row per match (an empty table when nothing matches).

        p_return is a tree pointing to the key of the value to include in the return
        p_attDict is a dictionary of conditions.
        """
        output = []
        return_key = p_return.leaves()[0]

        # For each data object in the paradigm
        for datum in self.paradigm.data:
            for att in p_attDict:
                # A data object that lacks the attribute, or has another
                # value for it, fails the requirement
                if datum.get(att) != p_attDict[att]:
                    break
            else:
                # It passed all the tests, include it
                output.append(datum[return_key])

        # Return what we found (make sure this is a string)
        if len(output) == 1:
            return output[0]
        # Hardcoded HTML goodness
        # (Obviously this will have to change for text output)
        rows = ["<tr><td>" + item + "</td></tr>" for item in output]
        return "<table>" + "".join(rows) + "</table>"

def dictJoin(dict1, dict2):
    """
    A handy function to join two dictionaries
    If there is any key overlap, dict1 wins!
    (just make sure this doesn't happen)

    Returns a new dictionary; neither dict1 nor dict2 is modified.
    """
    # Work on a copy so the caller's dict2 is left untouched
    joined = dict(dict2)
    joined.update(dict1)
    return joined

def demo():
    """
    A quick demonstration: loads the 'german.xml' paradigm and shows a
    domain, a hierarchy (both as text on the terminal) and a table
    (as HTML, written to 'demo.html'), echoing each command before it runs
    """
    rule = "================================================================================"

    def banner(p_title):
        # Section heading framed by two rules, with a blank line either side
        print("\n" + rule + "\n" + p_title + "\n" + rule + "\n")

    def announce(p_command):
        # Echo the command as it would be typed, set off by blank lines
        print("")
        print(">>> " + p_command)
        print("")

    # Load a paradigm and send plain text to the terminal
    banner("Load: Paradigm(file)")
    announce("a = Paradigm('german.xml')")
    a = Paradigm('german.xml')
    announce("a.setOutput('term')")
    a.setOutput('term')
    announce("a.setFormat('text')")
    a.setFormat('text')

    # Print a domain
    banner("Domain: case")
    announce("a.show('case')")
    a.show('case')

    # Print a hierarchy
    banner("Hierarchy: case/gender")
    announce("a.show('case/gender')")
    a.show('case/gender')

    # Print a table (HTML only, so switch format and destination first)
    banner("Table: table(case/number,gender,content)")
    announce("a.setOutput('demo.html')")
    a.setOutput('demo.html')
    announce("a.setFormat('html')")
    a.setFormat('html')
    announce("a.show('table(case/number,gender,content)')")
    a.show('table(case/number,gender,content)')

    # Some space
    print("")

# Run the demonstration when this module is executed as a script
if __name__ == '__main__':
    demo()