
Source Code for Module Bio.PubMed

# Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
This module provides code to work with PubMed from the NCBI.
http://www.ncbi.nlm.nih.gov/PubMed/

Online documentation for linking to PubMed is available at:
http://www.ncbi.nlm.nih.gov/PubMed/linking.html


Classes:
Dictionary     Access PubMed articles using a dictionary interface.

Functions:
search_for     Search PubMed.
find_related   Find related articles in PubMed.
download_many  Download many articles from PubMed in batch mode.

"""

import string
import re
import sgmllib

from Bio import File
from Bio.WWW import RequestLimiter
from Bio.WWW import NCBI
from Bio import Medline

class Dictionary:
    """Access PubMed using a read-only dictionary interface.

    Methods:

    """
    def __init__(self, delay=5.0, parser=None):
        """Dictionary(delay=5.0, parser=None)

        Create a new Dictionary to access PubMed.  parser is an optional
        parser (e.g. Medline.RecordParser) object used to change the
        results into another form.  If set to None, then the raw contents
        of the file will be returned.  delay is the number of seconds to
        wait between each query.

        """
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError, "PubMed contains lots of entries"
    def clear(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def __setitem__(self, key, item):
        raise NotImplementedError, "This is a read-only dictionary"
    def update(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def copy(self):
        raise NotImplementedError, "You don't need to do this..."
    def keys(self):
        raise NotImplementedError, "You don't really want to do this..."
    def items(self):
        raise NotImplementedError, "You don't really want to do this..."
    def values(self):
        raise NotImplementedError, "You don't really want to do this..."

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """S.__getitem__(id) -> object

        Return the Medline entry.  id is either the Medline Unique ID
        or the PubMed ID of the article.  Raises a KeyError if there's
        an error.

        """
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = NCBI.efetch(
                db="pubmed", id=id, retmode='text', rettype='medlars')
        except IOError, x:
            # Raise a KeyError instead of an IOError.
            # XXX I really should distinguish between a real IOError and
            # the case where the id is not in the database.
            raise KeyError, x
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()

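As a usage sketch (not part of the module source): Dictionary behaves like a
read-only mapping from article IDs to records.  The PMID below is an arbitrary
placeholder, and each lookup performs a live request to NCBI.

# Usage sketch for Dictionary.  The PMID is an arbitrary placeholder;
# each lookup performs a live query against NCBI.
from Bio import PubMed, Medline

# Look up a record as raw MEDLINE-format text.
pubmed = PubMed.Dictionary(delay=5.0)
text = pubmed['12230038']
print text[:200]

# With a parser, lookups return parsed objects instead of raw text.
pubmed_parsed = PubMed.Dictionary(delay=5.0, parser=Medline.RecordParser())
record = pubmed_parsed['12230038']   # a Medline.Record, not a string
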
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, delay=2, callback_fn=None,
               start_id=0, max_ids=None):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, delay][, callback_fn][, start_id][, max_ids]) -> ids

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to query the
    database.  reldate is the number of days back from the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to retrieve at one time.  By default, it is set to
    100; the maximum allowed is 10000.  delay is the number of seconds
    to wait between queries (default 2).  callback_fn is an optional
    callback function that will be called with each PMID as it is
    retrieved.  start_id specifies the index of the first id to
    retrieve and max_ids specifies the maximum number of id's to
    retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get them to work.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Parse the ID's out of the XML-formatted page that PubMed
        # returns.  The format of the page is:
        # [...]
        # <Id>...</Id>
        # [...]
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_id:
                return
            # If data is just whitespace, then ignore it.
            data = string.strip(data)
            if not data:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only integers.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise SyntaxError, \
                      "I expected an ID, but %s doesn't look like one." % \
                      repr(data)
            self.ids.append(data)

    params = {
        'db' : 'pubmed',
        'term' : search,
        'reldate' : reldate,
        'mindate' : mindate,
        'maxdate' : maxdate
        }
    for k, v in params.items():
        if v is None:
            del params[k]

    limiter = RequestLimiter(delay)
    ids = []
    while max_ids is None or len(ids) < max_ids:
        parser = ResultParser()

        # Check to make sure enough time has passed since my
        # last search.  If not, then wait.
        limiter.wait()

        start = start_id + len(ids)
        max = batchsize
        if max_ids is not None and max > max_ids - len(ids):
            max = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = max
        h = NCBI.esearch(**params)
        parser.feed(h.read())
        ids.extend(parser.ids)
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for id in parser.ids:
                callback_fn(id)
        if len(parser.ids) < max or not parser.ids:  # no more id's to read
            break
    return ids

def find_related(pmid):
    """find_related(pmid) -> ids

    Search PubMed for a list of citations related to pmid.  pmid may
    be a single ID (a PubMed ID or Medline Unique ID) or a list of
    such IDs.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Parse the ID's out of the XML-formatted page that PubMed
        # returns.  The format of the page is:
        # [...]
        # <Link>
        #    <Id>...</Id>
        # </Link>
        # [...]
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_link = 0
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        def start_link(self, attributes):
            self.in_link = 1
        def end_link(self):
            self.in_link = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_link or not self.in_id:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only integers.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise SyntaxError, \
                      "I expected an ID, but %s doesn't look like one." % \
                      repr(data)
            self.ids.append(data)

    parser = ResultParser()
    if type(pmid) is type([]):
        pmid = string.join(pmid, ',')
    h = NCBI.elink(dbfrom='pubmed', id=pmid)
    parser.feed(h.read())
    return parser.ids

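A usage sketch for the two functions above (not part of the module source).
The query term is hypothetical, and both calls hit NCBI's live E-utilities.

# Usage sketch for search_for and find_related.  The query term is
# hypothetical; both calls perform live requests against NCBI.
from Bio import PubMed

ids = PubMed.search_for("diabetes[MH] AND 2002[DP]", max_ids=10)
print "found %d articles" % len(ids)

if ids:
    # Find citations related to the first hit.
    related = PubMed.find_related(ids[0])
    print "%s has %d related articles" % (ids[0], len(related))
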
def download_many(ids, callback_fn, broken_fn=None, delay=120.0,
                  faildelay=5.0, batchsize=500, parser=None):
    """download_many(ids, callback_fn[, broken_fn][, delay][, faildelay]
    [, batchsize])

    Download many records from PubMed.  ids is a list of either the
    Medline Unique IDs or the PubMed IDs of the articles.  Each time a
    record is downloaded, callback_fn is called with the id and the
    text of the record.  broken_fn is an optional function that is
    called with the id of records that could not be downloaded.  delay
    is the number of seconds to wait between requests, and faildelay
    is the number of seconds to wait after a failed request.
    batchsize is the number of records to request each time.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError, "batchsize must be between 1 and 500"
    limiter = RequestLimiter(delay)
    current_batchsize = batchsize

    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take is to start by downloading as many as I can.  If the
    # request fails, I'm going to halve the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        # Make sure enough time has passed before I do another query.
        if not nsuccesses:
            limiter.wait(faildelay)
        else:
            limiter.wait()
        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = NCBI.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't, then I'm
            # going to raise an exception.  This could take a lot of
            # memory if the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:   # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.
                current_batchsize = current_batchsize / 2
                nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
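
Finally, a usage sketch for download_many (not part of the module source).
The PMIDs are arbitrary placeholders; the callbacks just collect results in
memory, and the call performs live requests against NCBI.

# Usage sketch for download_many.  The PMIDs are arbitrary
# placeholders; each record's id and text are passed to callback_fn,
# and any id that cannot be fetched goes to broken_fn.
from Bio import PubMed

records = {}
broken = []

def save_record(id, text):
    records[id] = text

def note_broken(id):
    broken.append(id)

PubMed.download_many(['12230038', '11748933', '11700088'],
                     save_record, broken_fn=note_broken, batchsize=3)
print "downloaded %d record(s); %d broken" % (len(records), len(broken))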