Package Bio :: Module PubMed
[hide private]
[frames] | [no frames]

Source Code for Module Bio.PubMed

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with PubMed from the NCBI. 
  8  http://www.ncbi.nlm.nih.gov/PubMed/ 
  9   
 10  Online documentation for linking to PubMed is available at: 
 11  http://www.ncbi.nlm.nih.gov/PubMed/linking.html 
 12   
 13   
 14  Classes: 
 15  Dictionary     Access PubMed articles using a dictionary interface. 
 16   
 17  Functions: 
 18  search_for     Search PubMed. 
 19  find_related   Find related articles in PubMed. 
 20  download_many  Download many articles from PubMed in batch mode. 
 21   
 22  """ 
 23   
 24  import re 
 25  import sgmllib 
 26   
 27  from Bio import File 
 28  from Bio import Entrez 
 29  from Bio import Medline 
 30   
class Dictionary:
    """Access PubMed using a read-only dictionary interface.

    Entries are fetched on demand from NCBI via Bio.Entrez, so only
    item lookup (and the helpers built on it) is supported; all
    mutating or enumerating dictionary methods raise
    NotImplementedError.

    Methods:
    __getitem__    Fetch one record by Medline Unique ID or PubMed ID.
    has_key        Check whether an ID can be fetched.
    get            Fetch a record, returning a default on failure.

    """
    def __init__(self, parser=None):
        """Dictionary(parser=None)

        Create a new Dictionary to access PubMed.  parser is an optional
        parser (e.g. Medline.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.

        """
        self.parser = parser

    def __len__(self):
        # PubMed is effectively unbounded; counting it makes no sense.
        raise NotImplementedError("PubMed contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        # Enumerating every PubMed ID would mean downloading the whole
        # database, so these are deliberately unsupported.
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """S.get(id[, failobj]) -> object

        Return the entry for id, or failobj if the lookup fails.

        """
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """S.__getitem__(id) -> object

        Return the Medline entry.  id is either the Medline Unique ID
        or the Pubmed ID of the article.  Raises a KeyError if there's an
        error.

        """
        try:
            handle = Entrez.efetch(
                db="pubmed", id=id, retmode='text', rettype='medlars')
        except IOError as x:
            # raise a KeyError instead of an IOError
            # XXX I really should distinguish between a real IOError and
            # if the id is not in the database.
            raise KeyError(x)
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, callback_fn][, start_id][, max_ids]) -> ids

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of dates prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to request at one time; by default it is 100.
    callback_fn is an optional callback function that will be passed
    each PMID as results are retrieved.  start_id specifies the index
    of the first id to retrieve and max_ids specifies the maximum
    number of id's to retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Parse the ID's out of the XML-formatted page that PubMed
        # returns.  The format of the page is:
        # [...]
        #   <Id>...</Id>
        # [...]
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_id:
                return
            # If data is just whitespace, then ignore it.
            data = data.strip()
            if not data:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only integers.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise ValueError(
                    "I expected an ID, but %s doesn't look like one." %
                    repr(data))
            self.ids.append(data)

    # Drop unset search restrictions so they are not sent to NCBI.
    # Snapshot the keys first so deletion is safe during iteration.
    params = {
        'db': 'pubmed',
        'term': search,
        'reldate': reldate,
        'mindate': mindate,
        'maxdate': maxdate,
        }
    for k in list(params.keys()):
        if params[k] is None:
            del params[k]

    ids = []
    while max_ids is None or len(ids) < max_ids:
        parser = ResultParser()

        # Ask for the next batch, never exceeding the caller's
        # remaining quota of max_ids.
        start = start_id + len(ids)
        batch = batchsize
        if max_ids is not None and batch > max_ids - len(ids):
            batch = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = batch
        h = Entrez.esearch(**params)
        parser.feed(h.read())
        ids.extend(parser.ids)
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for id in parser.ids:
                callback_fn(id)
        if len(parser.ids) < batch or not parser.ids:  # no more id's to read
            break
    return ids

def find_related(pmid):
    """find_related(pmid) -> ids

    Search PubMed for a list of citations related to pmid.  pmid can
    either be a single ID or a list of IDs.

    NOTE(review): the original header of this function (signature,
    docstring, and the start of ResultParser) was lost when this file
    was extracted; it has been reconstructed from the surviving body.
    Confirm against the original source.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Pull the ID's out of the ELink results.  Only ID's that
        # appear inside a <Link> element are related articles.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_link = 0
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        def start_link(self, attributes):
            self.in_link = 1
        def end_link(self):
            self.in_link = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_link or not self.in_id:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only integers.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise ValueError(
                    "I expected an ID, but '%s' doesn't look like one." %
                    repr(data))
            self.ids.append(data)

    parser = ResultParser()
    if type(pmid) is type([]):
        pmid = ','.join(pmid)
    h = Entrez.elink(dbfrom='pubmed', id=pmid)
    parser.feed(h.read())
    return parser.ids
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """download_many(ids, callback_fn[, broken_fn][, batchsize])

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the id and the
    text of the record.  broken_fn is an optional function that is
    called with the id of records that were not able to be downloaded.
    batchsize is the number of records to request each time.

    Raises ValueError if batchsize is not between 1 and 500.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    current_batchsize = batchsize

    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take, is to start by downloading as many as I can.  If the
    # request fails, I'm going to half the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't, then I'm
            # going to raise an exception.  This could take a lot of
            # memory if the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:   # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.  Floor division keeps this an integer
                # count on both Python 2 and Python 3.
                current_batchsize = current_batchsize // 2
                nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time, clamped to the caller's
        # batchsize.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize