Trees | Indices | Help |
---|
|
# NOTE(review): This block is Python 2 source recovered from an
# Epydoc-generated HTML page.  The extraction embedded the original
# line numbers into the text and DROPPED every `def`/`class` header
# line, so none of the definitions below are syntactically complete.
# Do not edit this text as-is; recover the original module (it parses
# NCBI Entrez EUtils XML responses -- ESearch, EPost, ESummary, EFetch
# and ELink -- into Datatypes objects) before making code changes.
#
# Map of the visible bodies, keyed by the EMBEDDED original line
# numbers (function names inferred from call sites inside this text;
# TODO confirm against the real module):
#   1-5     module imports: codecs, time, urllib, re, htmlentitydefs,
#           xml.sax, plus project modules Datatypes, ReseekFile,
#           MultiDict, POM
#   8-20    body of _construct_pattern() -- builds an entity regex and
#           a name->unicode table from htmlentitydefs.entitydefs
#   22      module level: _entity_pattern, entitydefs = _construct_pattern()
#   25-28   body of _load_module(name) -- dotted-name __import__ walk
#   39-49   a parse_using_dtd-style method (reads self.module_name,
#           builds a POM parser, clears feature_external_ges so the
#           NCBI DTDs are not fetched over the network)
#   53-69   body of _check_for_errors(pom) -- pulls "ERROR",
#           "ErrorList" and "WarningList" out of the parsed tree
#   73-115  body of _check_for_bad_input_stream(infile, ...) -- sniffs
#           the first 500 bytes for NCBI HTML/XML error pages and,
#           when force_encoding is set, recodes Latin-1 input to UTF-8
#   120-253 body of an eSearch parser (takes infile and webenv_ref):
#           error screening, WebEnv/QueryKey/Count fields, IdList,
#           TranslationSet, and an RPN TranslationStack -> Expression
#           conversion ending in Datatypes.SearchResult
#   259-281 body of an ePost parser -> Datatypes.PostResult
#   286-315 pubdate/entrezdate regexes and the month-name table
#   324-351 body of a date-string parser -> Datatypes.Date
#   354-368 body of unescape_entities(s)
#   371-403 convert_summary_* helpers and summary_type_parser_table
#           (the Integer/Unknown/Date helper bodies were lost entirely
#           by the extraction -- only table entries 398-402 remain)
#   406-419 body of an eSummary parser -> list of Datatypes.Summary
#   425-436 eFetch XML parser body and identifier-list parser body
#   440-449 body of _check_for_link_errors(pom)
#   452-461 body of _parse_link(infile)
#   464-483 neighbor-links parser -> Datatypes.NeighborLinkSet
#   487-499 link-check parser -> Datatypes.CheckLinkSet
#   501     parse_ncheck = parse_lcheck
#   504-510 body of _get_opt_string(ele, name)
#   513-545 llinks parser -> Datatypes.LinksLinkSet
#   547     parse_prlinks = parse_llinks
#   550-552 a final eLink parse body (returns the raw POM tree)
1 import codecs, time, urllib, re, htmlentitydefs 2 from xml.sax import xmlreader, SAXException 3 import Datatypes, ReseekFile, MultiDict 4 from xml.sax.handler import feature_external_ges 5 import POM 68 n = max([len(x) for x in htmlentitydefs.entitydefs.keys()]) 9 entity_pattern = re.compile(r"&([a-zA-Z]{1,%d});" % n) 10 11 defs = {} 12 for k, v in htmlentitydefs.entitydefs.items(): 13 if len(v) == 1: 14 defs[k] = unicode(v, "latin-1") 15 elif v[:2] == "&#" and v[-1] == ";": 16 defs[k] = unichr(int(v[2:-1])) 17 else: 18 raise AssertionError("Unexpected entitydef value: %r" % v) 19 20 return entity_pattern, defs21 22 _entity_pattern, entitydefs = _construct_pattern() 2325 mod = __import__(name) 26 for term in name.split(".")[1:]: 27 mod = getattr(mod, term) 28 return mod29 333750 51 # Pull out the "ERROR", "ErrorList", and "WarningList" terms39 module = _load_module(self.module_name) 40 cb = GetObject() 41 parser = POM.get_parser(callback = cb, module = module) 42 # This tells the parser to not resolve the NCBI DTDs 43 try: 44 parser.setFeature(feature_external_ges, 0) 45 SAXException 46 except SAXException: 47 pass 48 parser.parse(file) 49 return cb.obj53 errmsg = None 54 errors = [] 55 warnings = [] 56 57 err = pom.get("ERROR", None) 58 if err is not None: 59 errmsg = err.tostring() 60 61 for x in pom.get("ErrorList", []): 62 errors.append( 63 Datatypes.problem_category_mapping[x.__class__.__name__]( 64 x.tostring())) 65 for x in pom.get("WarningList", []): 66 warnings.append( 67 Datatypes.problem_category_mapping[x.__class__.__name__]( 68 x.tostring())) 69 return errmsg, errors, warnings70 7173 reseekfile = ReseekFile.ReseekFile(infile) 74 s = reseekfile.read(500) 75 reseekfile.seek(0) 76 reseekfile.nobuffer() 77 78 lines = s.split("\n") 79 if len(lines) > 3: 80 if lines[0] == "<Html>": 81 if lines[2].find("<h2>Error occured:") != 1: 82 s = re.findall(r"Error occured:([^<]+)", lines[2])[0] 83 s = urllib.unquote(s) 84 raise Datatypes.EUtilsError(s) 85 raise 
Datatypes.EUtilsError("Unknown error:\n" + 86 reseekfile.read(1000)) 87 88 # On error, fetch can return a valid XML document, but not one 89 # which matches the DTD. Rather than change the DTD (which is 90 # pubmed_020114.dtd) I'll check it here to raise the error. 91 # XXX HACK! 92 if lines[2] == "<pmFetchResult>": 93 # <pmFetchResult> 94 # \t<ERROR>Empty id list - nothing todo</ERROR> 95 # </pmFetchResult> 96 s = "Unable to parse pmFetchResult error message" 97 if len(lines) > 4: 98 s = re.findall(r"<ERROR>([^>]+)</ERROR>", lines[3])[0] 99 raise Datatypes.EUtilsError(s) 100 101 # This happens when you choose a database which doesn't exist 102 # Are there other reasons? Probably yes, if you choose 103 # other illegal parameters. 104 if lines[0].startswith("<!doctype"): 105 raise Datatypes.EUtilsError("Parameter not allowed") 106 107 if force_encoding and lines[0].startswith('<?xml version="1.0"?>'): 108 # Doesn't use an encoding, which means the XML is supposed 109 # to be in UTF-8 encoding. However, it seems NCBI uses 110 # Latin-1 so we need to translate the Latin-1 input to 111 # UTF-8 output else the XML parsers will fail for non-ASCII 112 # characters. 113 reseekfile = codecs.EncodedFile(reseekfile, "utf-8", "iso-8859-1") 114 115 return reseekfile116 117 ############################## 118120 # Need to pull out the webenv from the input stream 121 infile = _check_for_bad_input_stream(infile) 122 123 xml_parser = UsePOMParser("eSearch_020511") 124 pom = xml_parser.parse_using_dtd(infile) 125 126 errmsg, errors, warnings = _check_for_errors(pom) 127 # ErrorList (PhraseNotFound*,FieldNotFound*)> 128 # WarningList (PhraseIgnored*, 129 # QuotedPhraseNotFound*, 130 # OutputMessage*)> 131 132 # If it's only "PhraseNotFound" erros, with an 133 # OutputMessage of "No items found." then personally 134 # think that should be considered the same as a search 135 # which returned no results. 
136 137 # Set things up for an empty match 138 webenv = None 139 query_key = None 140 count = 0 141 retmax = 0 142 retstart = 0 143 ids = [] 144 translation_set = {} 145 expression = None 146 147 nothing_matched = 0 148 if errmsg == "Can't run executor": 149 # Check that the error list only contains PhraseNotFound terms 150 flg = 1 151 for x in errors: 152 if x.category != "PhraseNotFound": 153 flg = 0 154 break 155 if flg: 156 # Okay, only PhraseNotFound. Make sure there is 157 # only one OutputMessage, with the text "No items found." 158 # (Eg, an OutputMessage of 'Query syntax error.' means 159 # there was a real problem.) 160 msgs = [x for x in warnings if x.category == "OutputMessage"] 161 if len(msgs) == 1 and msgs[0].text == "No items found.": 162 nothing_matched = 1 163 164 if not nothing_matched: 165 # This is an error 166 raise Datatypes.EUtilsSearchError(errmsg, 167 errors, 168 warnings) 169 170 # In other words, check if something matched 171 if not nothing_matched: 172 ## Get WebEnv, if it exists 173 if pom.get_element("WebEnv") is not None: 174 s = pom["WebEnv"].tostring() 175 webenv = urllib.unquote(s) 176 # ONLY change webenv_ref if there's a new one 177 webenv_ref[0] = webenv 178 179 # Other simple fields 180 if pom.get_element("QueryKey") is not None: 181 query_key = pom["QueryKey"].tostring() 182 183 count = int(pom["Count"].tostring()) 184 retmax = int(pom["RetMax"].tostring()) 185 retstart = int(pom["RetStart"].tostring()) 186 187 # The identifiers (if any) 188 # NOTE: not a DBIds because the search result doesn't list the 189 # database searched! 
190 ids = [x.tostring() for x in pom["IdList"].find_elements("Id")] 191 192 # TranslationSet 193 translation_set = {} 194 for ele in pom["TranslationSet"]: 195 translation_set[urllib.unquote_plus(ele["From"].tostring())] = \ 196 urllib.unquote_plus(ele["To"].tostring()) 197 198 # Convert the RPN TranslationStack into an Expression 199 stack = [] 200 try: 201 translation_stack = pom["TranslationStack"] 202 except IndexError: 203 translation_stack = [] 204 for ele in translation_stack: 205 if ele.__class__.__name__ == "TermSet": 206 stack.append(Datatypes.Term( 207 term = urllib.unquote_plus(ele["Term"].tostring()), 208 field = urllib.unquote_plus(ele["Field"].tostring()), 209 count = int(ele["Count"].tostring()), 210 explode = ele["Explode"].tostring())) 211 elif ele.__class__.__name__ == "OP": 212 s = ele.tostring().strip() 213 if s == "AND": 214 stack[-2:] = [stack[-2] & stack[-1]] 215 elif s == "OR": 216 stack[-2:] = [stack[-2] | stack[-1]] 217 elif s == "RANGE": 218 stack[-2:] = [Datatypes.Range(stack[-2], stack[-1])] 219 elif s == "NOT": 220 stack[-2:] = [Datatypes.Not(stack[-2], stack[-1])] 221 elif s == "GROUP": 222 # GROUP doesn't appear to do any more than put an extra 223 # parenthesis around ANDs and ORs -- can't find any 224 # specific documentation on its role 225 # So right now it is redundant and just ignore it 226 pass 227 else: 228 raise TypeError("Unknown OP code: %r" % (s,)) 229 else: 230 raise TypeError("Unknown TranslationStack element: %r" % 231 (ele.__class__.__name__,)) 232 233 # hack -- it appears as if the translation stack is sometimes missing 234 # an AND at the end, which I guess is supposed to be implicit. For 235 # instance, doing a text word search plus date range leaves off a 236 # trailing and to link the final elements. 
237 if len(stack) == 2: 238 stack[-2:] = [stack[-2] & stack[-1]] 239 240 if len(stack) > 1: 241 raise TypeError("Incomplete TranslationStack: %r" % stack) 242 elif not stack: 243 stack = [None] 244 245 expression = stack[0] 246 247 # Return either our synthesized query or 248 search_result = Datatypes.SearchResult(count, retmax, retstart, ids, 249 translation_set, expression, 250 webenv, query_key, errors, 251 warnings, time.time()) 252 253 return search_result254 255 256 ########################### 257259 # It doesn't look like I need check for a bad input stream 260 # since I can only generate two types of error messages 261 262 # ePost_020511.dtd 263 xml_parser = UsePOMParser("ePost_020511") 264 pom = xml_parser.parse_using_dtd(infile) 265 266 # If there was an ERROR, raise it now 267 errmsg, errors, warnings = _check_for_errors(pom) 268 if errmsg is not None: 269 raise Datatypes.EUtilsError(errmsg) 270 271 # Get any invalid identifies 272 invalid_ids = [x.tostring() for x in pom.get("InvalidIdList", [])] 273 274 # Otherwise, get the WebEnv string 275 s = pom["WebEnv"].tostring() 276 webenv = urllib.unquote(s) 277 webenv_ref[0] = webenv 278 279 query_key = pom["QueryKey"].tostring() 280 281 return Datatypes.PostResult(webenv, query_key, invalid_ids, time.time())282 283 ############################### 284 285 286 # PubDate: '2000 Feb 1' or '1975 Jun' or '1995' 287 # BLAH! 
PubMed 8318652 also has "1993 May-Jun" for 288 _pubdate_format1 = re.compile( 289 r"(?P<year>\d{4})( (?P<month>[A-Za-z]{3})( (?P<day>\d+))?)?$") 290 _pubdate_format2 = re.compile( 291 r"(?P<year>\d{4}) (?P<month1>[A-Za-z]{3})-(?P<month2>[A-Za-z]{3})") 292 293 _month_names_to_number = { 294 None: 1, 295 "Jan": 1, 296 "Feb": 2, 297 "Mar": 3, 298 "Apr": 4, 299 "May": 5, 300 "Jun": 6, 301 "Jul": 7, 302 "Aug": 8, 303 "Sep": 9, 304 "Oct": 10, 305 "Nov": 11, 306 "Dec": 12, 307 } 308 309 310 311 # Ignoring the hour and minute parts -- they seem to be either 312 # midnight or 09:00 and since I don't know the timezone it seems 313 # rather pointless 314 # EntrezDate: 2000/02/17 09:00 315 _entrezdate_format = re.compile(r"(?P<year>\d+)/(?P<month>\d+)/(?P<day>\d+)") 316 317 # This may not be the right way to do this. 318 # Perhaps should keep the string and only translate upon request 319 # to a given time format? 322324 # Can be in one of several different formats 325 m = _pubdate_format1.match(s) 326 if m is not None: 327 # 2000 Feb 15 328 d = {} 329 d["year"] = int(m.group("year")) 330 d["month"] = _month_names_to_number[m.group("month")] 331 try: 332 d["day"] = int(m.group("day")) 333 except TypeError: # if this is None 334 d["day"] = 1 335 return Datatypes.Date(**d) 336 m = _pubdate_format2.match(s) 337 if m is not None: 338 # 1993 May-Jun 339 d = {} 340 d["year"] = int(m.group("year")) 341 d["month"] = _month_names_to_number[m.group("month1")] 342 d["day"] = 1 343 return Datatypes.Date(**d) 344 345 m = _entrezdate_format.match(s) 346 if m is not None: 347 return Datatypes.Date(year = int(m.group("year"),), 348 month = int(m.group("month")), 349 day = int(m.group("day"))) 350 351 raise TypeError("Unknown date format: %s" % (s,))352354 if "&" not in s: 355 return unicode(s) 356 357 terms = [] 358 i = 0 359 defs = entitydefs 360 for m in _entity_pattern.finditer(s): 361 terms.append(s[i:m.start()]) 362 try: 363 terms.append(defs[m.group(1)]) 364 except KeyError: 365 
terms.append(m.group(0)) 366 i = m.end() 367 terms.append(s[i:]) 368 return "".join(terms)369371 # The text may have HTML entity definitions .. convert as needed 372 # 373 # XXX Is this correct? Most other characters are properly 374 # encoded. This may mean that that data provider messed up and 375 # sent data in the wrong format. 376 return unescape_entities(x.tostring())377 380 383385 # XXX I'm not doing this as a list.. Should I? 386 return convert_summary_Items(x.find_elements("Item"))387389 d = MultiDict.OrderedMultiDict() 390 for item in x: 391 name = item.Name 392 if name in d: 393 print "Found multiple Items named %r!" % (name,) 394 d[name] = summary_type_parser_table[item.Type](item) 395 return d396 397 summary_type_parser_table = { 398 "String": convert_summary_String, 399 "Integer": convert_summary_Integer, 400 "Unknown": convert_summary_Unknown, 401 "Date": convert_summary_Date, 402 "List": convert_summary_List, 403 } 404406 infile = _check_for_bad_input_stream(infile) 407 xml_parser = UsePOMParser("eSummary_020511") 408 pom = xml_parser.parse_using_dtd(infile) 409 errmsg, errors, warnings = _check_for_errors(pom) 410 if errmsg is not None: 411 raise Datatypes.EUtilsError(errmsg) 412 413 results = [] 414 for docsum in pom: 415 id = docsum["Id"].tostring() 416 d = convert_summary_Items(docsum.find_elements("Item")) 417 results.append(Datatypes.Summary(id, d)) 418 419 return results420 421 ############################### 422 423 # XML425 infile = _check_for_bad_input_stream(infile, force_encoding = 0) 426 xml_parser = UsePOMParser("pubmed_020114") 427 return xml_parser.parse_using_dtd(infile)428 431 432 # Identifer list ("\n" separated) 433 # Useful for "uilist", "acc", and a few others435 infile = _check_for_bad_input_stream(infile) 436 return [x.strip() for x in infile.readlines() if x != "\n"]437 438 ###############################440 if not pom.has_key("LinkSet"): 441 if pom.has_key("ERROR"): 442 raise Datatypes.EUtilsError(pom["ERROR"].tostring()) 
443 raise Datatypes.EUtilsError("Server failed to process request") 444 if len(pom.find_elements("LinkSet")) != 1: 445 raise AssertionError( 446 "Did not expect to find more than one LinkSet in the XML") 447 linkset = pom["LinkSet"] 448 if linkset.has_key("ERROR"): 449 raise Datatypes.EUtilsError(linkset["ERROR"].tostring())450452 #infile = _check_for_bad_input_stream(infile) 453 # Need this, as seen in 454 # http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&cmd=llinks&db=pubmed&id=10611131%2C12085853 455 # which has an a-with-umlaut in Latin-1 encoding 456 457 infile = codecs.EncodedFile(infile, "utf-8", "iso-8859-1") 458 xml_parser = UsePOMParser("eLink_020511") 459 pom = xml_parser.parse_using_dtd(infile) 460 _check_for_link_errors(pom) 461 return pom462464 pom = _parse_link(infile) 465 pom_linkset = pom["LinkSet"] 466 dbfrom = pom_linkset["DbFrom"].tostring().lower() 467 idlist = [x.tostring() for x in pom_linkset["IdList"].find_elements("Id")] 468 469 linksetdbs = MultiDict.OrderedMultiDict() 470 for pom_linksetdb in pom_linkset.find_elements("LinkSetDb"): 471 if pom_linksetdb.has_key("ERROR"): 472 raise Datatypes.EUtilsError(pom_linksetdb["ERROR"].tostring()) 473 dbto = pom_linksetdb["DbTo"].tostring().lower() 474 linkname = pom_linksetdb["LinkName"].tostring() 475 links = [] 476 for pom_link in pom_linksetdb.find_elements("Link"): 477 score = pom_link.get("Score") 478 if score is not None: 479 score = int(score.tostring()) 480 links.append(Datatypes.Link(pom_link["Id"].tostring(), score)) 481 linksetdbs[linkname] = Datatypes.LinkSetDb(dbto, linkname, links) 482 return Datatypes.NeighborLinkSet(Datatypes.DBIds(dbfrom.lower(), idlist), 483 linksetdbs)484 485487 pom = _parse_link(infile) 488 pom_linkset = pom["LinkSet"] 489 dbfrom = pom_linkset["DbFrom"].tostring().lower() 490 idchecks = [] 491 for ele in pom_linkset["IdCheckList"].find_elements("Id"): 492 has_linkout = getattr(ele, "HasLinkOut", "N") 493 has_linkout = {"Y": 1}.get(has_linkout, 
0) 494 has_neighbor = getattr(ele, "HasNeighbor", "N") 495 has_neighbor = {"Y": 1}.get(has_neighbor, 0) 496 idchecks.append(Datatypes.IdCheck(ele.tostring(), 497 has_linkout, 498 has_neighbor)) 499 return Datatypes.CheckLinkSet(dbfrom, idchecks)500 501 parse_ncheck = parse_lcheck 502504 x = ele.get(name) 505 if x is None: 506 return None 507 s = x.tostring() 508 if not s: 509 return None 510 return s511513 pom = _parse_link(infile) 514 pom_linkset = pom["LinkSet"] 515 dbfrom = pom_linkset["DbFrom"].tostring().lower() 516 idurlsets = [] 517 for ele in pom_linkset["IdUrlList"].find_elements("IdUrlSet"): 518 id = ele["Id"].tostring() 519 objurls = [] 520 for pom_objurl in ele.find_elements("ObjUrl"): 521 url = _get_opt_string(pom_objurl, "Url") 522 linkname = _get_opt_string(pom_objurl, "LinkName") 523 subject_types = [x.tostring() for x in 524 pom_objurl.find_elements("SubjectType")] 525 attributes = [s.tostring() for s in pom_objurl.find_elements("Attribute")] 526 527 pom_provider = pom_objurl["Provider"] 528 provider_name = pom_provider["Name"].tostring() 529 provider_name_abbr = pom_provider["NameAbbr"].tostring() 530 provider_id = pom_provider["Id"].tostring() 531 provider_url = _get_opt_string(pom_provider, "Url") 532 provider_icon_url = _get_opt_string(pom_provider, "IconUrl") 533 534 provider = Datatypes.Provider(provider_name, 535 provider_name_abbr, 536 provider_id, 537 provider_url, 538 provider_icon_url) 539 objurl = Datatypes.ObjUrl(subject_types, provider, 540 linkname, url, attributes) 541 objurls.append(objurl) 542 543 idurlsets.append(Datatypes.IdUrlSet(id, objurls)) 544 545 return Datatypes.LinksLinkSet(dbfrom, idurlsets)546 547 parse_prlinks = parse_llinks 548550 infile = _check_for_bad_input_stream(infile) 551 xml_parser = UsePOMParser("eLink_020511") 552 return xml_parser.parse_using_dtd(infile)553
Trees | Indices | Help |
---|
Generated by Epydoc 3.0.1 on Mon Sep 15 09:28:15 2008 | http://epydoc.sourceforge.net |