6 """
7 This module provides code to work with PubMed from the NCBI.
8 http://www.ncbi.nlm.nih.gov/PubMed/
9
10 Online documentation for linking to PubMed is available at:
11 http://www.ncbi.nlm.nih.gov/PubMed/linking.html
12
13
14 Classes:
15 Dictionary Access PubMed articles using a dictionary interface.
16
17 Functions:
18 search_for Search PubMed.
19 find_related Find related articles in PubMed.
20 download_many Download many articles from PubMed in batch mode.
21
22 """
23
24 import string
25 import re
26 import sgmllib
27
28 from Bio import File
29 from Bio.WWW import RequestLimiter
30 from Bio.WWW import NCBI
31 from Bio import Medline
32
34 """Access PubMed using a read-only dictionary interface.
35
36 Methods:
37
38 """
39 - def __init__(self, delay=5.0, parser=None):
40 """Dictionary(delay=5.0, parser=None)
41
42 Create a new Dictionary to access PubMed. parser is an optional
43 parser (e.g. Medline.RecordParser) object to change the results
44 into another form. If set to None, then the raw contents of the
45 file will be returned. delay is the number of seconds to wait
46 between each query.
47
48 """
49 self.parser = parser
50 self.limiter = RequestLimiter(delay)
51
    # The mutating and enumerating mapping methods are not supported by
    # this read-only, web-backed dictionary.
    def __len__(self):
        raise NotImplementedError, "PubMed contains lots of entries"
    def clear(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def __setitem__(self, key, item):
        raise NotImplementedError, "This is a read-only dictionary"
    def update(self, d):
        raise NotImplementedError, "This is a read-only dictionary"
    def copy(self):
        raise NotImplementedError, "You don't need to do this..."
    def keys(self):
        raise NotImplementedError, "You don't really want to do this..."
    def items(self):
        raise NotImplementedError, "You don't really want to do this..."
    def values(self):
        raise NotImplementedError, "You don't really want to do this..."

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj
        raise AssertionError("How did I get here?")

    def __getitem__(self, id):
        """S.__getitem__(id) -> object

        Return the Medline entry.  id is either the Medline Unique ID
        or the PubMed ID of the article.  Raises a KeyError if there's an
        error.

        """
        # Make sure enough time has passed since the last query.
        self.limiter.wait()

        try:
            handle = NCBI.efetch(
                db="pubmed", id=id, retmode='text', rettype='medlars')
        except IOError, x:
            # Report a failed fetch as a missing key rather than an I/O
            # problem, to match the dictionary interface.
            raise KeyError, x
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()

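
# A minimal usage sketch (illustrative, not part of the original module):
# look up a single article through the dictionary interface above.  The
# PMID is a placeholder; Medline.RecordParser is the parser suggested in
# the Dictionary docstring, and with parser=None the raw MEDLINE text
# would be returned instead.
def _example_dictionary_lookup():
    pubmed = Dictionary(delay=5.0, parser=Medline.RecordParser())
    try:
        return pubmed['12345678']      # hypothetical PMID
    except KeyError:
        return None
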
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, delay=2, callback_fn=None,
               start_id=0, max_ids=None):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, delay][, callback_fn][, start_id][, max_ids]) -> ids

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to search the
    database.  reldate restricts the search to the given number of
    days before the current date.  mindate and maxdate restrict the
    search to a date range, e.g. 2002/01/01.  batchsize specifies the
    number of ids to retrieve per request (default 100; NCBI allows up
    to 10000).  delay is the number of seconds to wait between
    queries (default 2).  callback_fn is an optional callback function
    that will be called with each PMID as it is retrieved.  start_id
    specifies the index of the first id to retrieve and max_ids
    specifies the maximum number of id's to retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Pull the PMID's out of the results that esearch returns.
        # Each matching ID appears inside an <Id>...</Id> tag.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_id:
                return
            # Ignore whitespace between tags.
            data = string.strip(data)
            if not data:
                return
            # Everything left should be a PMID, which is a string of
            # digits.  Complain if it doesn't look like one.
            if self._not_pmid_re.search(data):
                raise SyntaxError, \
                      "I expected an ID, but %s doesn't look like one." % \
                      repr(data)
            self.ids.append(data)

    params = {
        'db' : 'pubmed',
        'term' : search,
        'reldate' : reldate,
        'mindate' : mindate,
        'maxdate' : maxdate
        }
    # Drop the date parameters the caller didn't specify.
    for k, v in params.items():
        if v is None:
            del params[k]

    limiter = RequestLimiter(delay)
    ids = []
    while max_ids is None or len(ids) < max_ids:
        parser = ResultParser()

        # Make sure enough time has passed since the last query.
        limiter.wait()

        start = start_id + len(ids)
        retmax = batchsize
        if max_ids is not None and retmax > max_ids - len(ids):
            retmax = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = retmax
        h = NCBI.esearch(**params)
        parser.feed(h.read())
        ids.extend(parser.ids)
        if callback_fn is not None:
            # Hand each newly retrieved PMID to the callback.
            for id in parser.ids:
                callback_fn(id)
        if len(parser.ids) < retmax or not parser.ids:
            break
    return ids

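
# A minimal usage sketch (illustrative, not part of the original module):
# run a search and report each PMID as it arrives.  The query string is a
# placeholder.
def _example_search():
    def report(pmid):
        print "found PMID", pmid
    # Stop after ten hits to keep the example quick.
    return search_for("biopython", max_ids=10, callback_fn=report)
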
def find_related(pmid):
    """find_related(pmid) -> ids

    Search PubMed for citations related to pmid.  pmid can be a single
    id (Medline Unique ID or PubMed ID) or a list of them.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Pull the PMID's of the related articles out of the results
        # that elink returns.  Each related ID appears inside an
        # <Id>...</Id> tag nested within a <Link>...</Link> tag.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_link = 0
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        def start_link(self, attributes):
            self.in_link = 1
        def end_link(self):
            self.in_link = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_link or not self.in_id:
                return
            # Everything here should be a PMID, which is a string of
            # digits.  Complain if it doesn't look like one.
            if self._not_pmid_re.search(data):
                raise SyntaxError, \
                      "I expected an ID, but %s doesn't look like one." % \
                      repr(data)
            self.ids.append(data)

    parser = ResultParser()
    if type(pmid) is type([]):
        pmid = string.join(pmid, ',')
    h = NCBI.elink(dbfrom='pubmed', id=pmid)
    parser.feed(h.read())
    return parser.ids

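
# A minimal usage sketch (illustrative, not part of the original module):
# ask PubMed for citations related to a couple of placeholder PMID's.
def _example_find_related():
    return find_related(['12345678', '23456789'])   # hypothetical PMID's
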
def download_many(ids, callback_fn, broken_fn=None, delay=120.0, faildelay=5.0,
                  batchsize=500, parser=None):
    """download_many(ids, callback_fn[, broken_fn][, delay][, faildelay]
    [, batchsize][, parser])

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID's or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the article's id and
    the record itself (the raw MEDLINE text, or a parsed record if parser
    is given).  broken_fn is an optional function that is called with the
    id of records that could not be downloaded.  delay is the number of
    seconds to wait between successful requests; faildelay is the shorter
    delay used while requests are failing.  batchsize is the number of
    records to request each time (between 1 and 500).  parser is an
    optional parser (e.g. Medline.RecordParser) used to convert each
    record before it is passed to callback_fn.

    """
    # Check the batch size before doing any network work.
    if batchsize > 500 or batchsize < 1:
        raise ValueError, "batchsize must be between 1 and 500"
    limiter = RequestLimiter(delay)
    current_batchsize = batchsize

    # Loop until every id has been processed, requesting as many records
    # per batch as possible.  If a request fails or comes back incomplete,
    # halve the batch size and retry; once a batch of a single id still
    # fails, give up on that id and report it through broken_fn.  After
    # two consecutive successes, grow the batch size back toward the
    # requested batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        # Wait before hitting the server again.  Use the shorter faildelay
        # until a request has gone through successfully.
        if not nsuccesses:
            limiter.wait(faildelay)
        else:
            limiter.wait()
        try:
            # Fetch the whole batch as MEDLINE-formatted text.
            handle = NCBI.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # Read the results and check that one record came back for
            # every id requested.  If not, treat the batch as failed so
            # that it is retried with a smaller batch size.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:
            if current_batchsize == 1:
                # This id could not be downloaded on its own; report it
                # and move on to the next one.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # The batch failed; try again with a smaller one.
                current_batchsize = current_batchsize / 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Hand each record, along with its id, to the callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # After a couple of clean requests, grow the batch size again,
        # but never beyond the requested batchsize.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
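
# A minimal usage sketch (illustrative, not part of the original module):
# download a few placeholder PMID's, saving the raw MEDLINE text of each
# record and reporting any id that could not be fetched.
def _example_download_many():
    texts = {}
    def save(id, text):
        texts[id] = text
    def report_broken(id):
        print "could not download", id
    download_many(['12345678', '23456789'], save, broken_fn=report_broken)
    return texts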