Package Bio :: Package EUtils :: Module HistoryClient
[hide private]
[frames] | no frames]

Source Code for Module Bio.EUtils.HistoryClient

  1  """Search and retreive information using the EUtils history. 
  2   
  3  EUtils has two major modes.  One uses history while the other uses 
  4  database identifiers.  This is a high-level interface for working with 
  5  the history.  You should use this module if you expect to work with 
  6  a large or unknown number of identifiers.
  7   
  8  See DBIdsClient if you want to get information about a set of known 
  9  database identifiers. 
 10   
 11  >>> from Bio.EUtils import HistoryClient 
 12  >>> client = HistoryClient.HistoryClient() 
 13  >>> cancer = client.search("cancer") 
 14  >>> print len(cancer) 
 15  1458353 
 16  >>>  
 17   
 18  That's quite a few hits.  Most people would like to see the first few 
 19  records then try to refine the search. 
 20   
 21  >>> print cancer[:5].efetch(retmode = "text", rettype = "docsum").read() 
 22   
 23  1:  Seow-Choen F. 
 24  Author's reply: Adjuvant therapy for rectal cancer cannot be based on the 
 25  results of other surgeons (Br J Surg 2002; 89: 946-947). 
 26  Br J Surg. 2003 Jan;90(1):121-122. 
 27  PMID: 12520589 [PubMed - as supplied by publisher] 
 28   
 29  2:  Mortensen N, Lindsey I. 
 30  Adjuvant therapy for rectal cancer cannot be based on the results of other 
 31  surgeons (Br J Surg 2002; 89: 946-947). 
 32  Br J Surg. 2003 Jan;90(1):121. 
 33  PMID: 12520588 [PubMed - as supplied by publisher] 
 34   
 35  3:  Osugi H, Takemura M, Higashino M, Takada N, Lee S, Kinoshita H. 
 36  A comparison of video-assisted thoracoscopic oesophagectomy and radical lymph 
 37  node dissection for squamous cell cancer of the oesophagus with open operation. 
 38  Br J Surg. 2003 Jan;90(1):108-13. 
 39  PMID: 12520585 [PubMed - in process] 
 40   
 41  4:  Tanaka M, Kitajima Y, Sato S, Miyazaki K. 
 42  Combined evaluation of mucin antigen and E-cadherin expression may help select 
 43  patients with gastric cancer suitable for minimally invasive therapy. 
 44  Br J Surg. 2003 Jan;90(1):95-101. 
 45  PMID: 12520583 [PubMed - in process] 
 46   
 47  5:  Diaz De Liano A, Oteiza Martinez F, Ciga MA, Aizcorbe M, Cobo F, Trujillo R. 
 48  Impact of surgical procedure for gastric cancer on quality of life. 
 49  Br J Surg. 2003 Jan;90(1):91-4. 
 50  PMID: 12520582 [PubMed - in process] 
 51   
 52  >>> 
 53   
 54  Now refine the query to publications in the last day 
 55   
 56  >>> from Bio import EUtils 
 57  >>> recent_cancer = client.search("#%s" % (cancer.query_key,), 
 58  ...                               daterange = EUtils.WithinNDays(1)) 
 59  >>> len(recent_cancer) 
 60  106 
 61  >>> 
 62   
 63  Still quite a few.  What's the last one about? 
 64  >>> for k, v in recent_cancer[-1].summary().dataitems.allitems(): 
 65  ...     print k, "=", v 
 66  ... 
 67   
 68  PubDate = 2002/12/01 
 69  Source = Nippon Shokakibyo Gakkai Zasshi 
 70  Authors = Kuroki T 
 71  Title = [Strategy against cancer in 21 century, with emphasis of cancer prevention and refractory cancer] 
 72  Volume = 99 
 73  Pages = 1423-7 
 74  EntrezDate = 2003/01/10 
 75  PubMedId = 12518389 
 76  MedlineId = 22406828 
 77  Lang = Japanese 
 78  PubType = 
 79  RecordStatus = PubMed - in process 
 80  Issue = 12 
 81  SO = 2002 Dec;99(12):1423-7 
 82  DOI = 
 83  JTA = KJY 
 84  ISSN = 0446-6586 
 85  PubId = 
 86  PubStatus = 4 
 87  Status = 6 
 88  HasAbstract = 0 
 89  ArticleIds = {'MedlineUID': u'22406828', 'PubMedId': u'12518389'} 
 90  >>>  
 91   
 92  Here's an interesting one.  Which articles are related to this one but 
 93  are not about cancer?  First, get the related articles. 
 94   
 95   
 96  >>> neighbors = recent_cancer[-1].neighbor_links() 
 97  >>> dbids = neighbors.linksetdbs["pubmed_pubmed"].dbids 
 98  >>> len(dbids) 
 99  10296 
100  >>>  
101   
102  Upload that back to the server 
103   
104  >>> related_result = client.post(dbids) 
105  >>> 
106  >>> non_cancer = client.search("#%s NOT #%s" % (related_result.query_key, 
107  ...                                             cancer.query_key)) 
108  >>> len(non_cancer) 
109  4000 
110  >>> 
111   
112  The HistoryClient instance has an attribute named 'query_history' 
113  which stores the searches done so far, keyed by the query_key value 
114  assigned by the server.  The history on the server can expire.  If 
115  that is detected during a search then previous results are invalidated 
116  and removed from the query_history.  Future requests from invalidated 
117  results will raise an error. 
118   
119  If a request is made from a search which has not been invalidated but 
120  whose history has expired then queries like 'summary' will raise an 
121  error.  Some other request (like 'dbids') may return success but 
122  contain undefined information. 
123   
124  """ #" 
125       
126  import types 
127  import ThinClient, parse, Datatypes, Mixins, Config 
128   
class HistoryCookie:
    """Values needed to get back to one entry in the server-side history.

    The three constructor arguments are stored unchanged as attributes:

      db         -- name of the database the entry belongs to
      webenv_ref -- the WebEnv reference; other code in this module reads
                    element [0] of it.  HistoryClient.search() passes None
                    here for a search that produced no query_key.
      query_key  -- key of the history entry, or None for such an empty
                    search
    """
    def __init__(self, db, webenv_ref, query_key):
        self.db, self.webenv_ref, self.query_key = db, webenv_ref, query_key
135
class HistoryLookup(object):
    """Look up information about a search held in the server history.

    An instance covers `retmax` records starting at offset `retstart` of
    the history entry described by `cookie`.  Read the "dbids" attribute
    to get the identifiers, fetched from the server's "uilist".
    """
    def __init__(self, eutils, cookie, retstart, retmax):
        self.eutils = eutils
        self.cookie = cookie
        self.retstart = retstart
        self.retmax = retmax
        # Copies taken at construction time.  HistoryClient later sets
        # self.query_key (not cookie.query_key) to None to mark this
        # object as invalidated.
        self.db = cookie.db
        self.query_key = cookie.query_key

    def _check_invalid(self):
        """Raise if no data can be requested through this history.

        Raises NotImplementedError when the cookie has no query_key (the
        cookie HistoryClient.search() builds for an empty result), and
        Datatypes.EUtilsError when this object's own query_key has been
        reset to None.
        """
        # Check if we can get data from this history
        if self.cookie.query_key is None:
            raise NotImplementedError("empty data set")
        if self.query_key is None:
            raise Datatypes.EUtilsError(
                "query history no longer available on server")

    def esummary(self, retmode = 'xml', rettype = None):
        """Request the eSummary for this history; returns the socket handle"""
        # NOTE(review): retmode and rettype are accepted but not forwarded
        # to esummary_using_history() -- confirm whether that is intended.
        self._check_invalid()
        cookie = self.cookie
        return self.eutils.esummary_using_history(
            webenv = cookie.webenv_ref[0],
            db = cookie.db,
            query_key = cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax)

    def summary(self):
        """the Datatypes.Summary for this history"""
        handle = self.esummary("xml")
        return parse.parse_summary_xml(handle)

    # NOTE(review): the listing this code was recovered from jumps from
    # original source line 172 to 193 at this point; whatever was defined
    # on lines 173-192 is not visible and is not reproduced here.

    def _get_dbids(self):
        # efetch() is not defined in the visible part of this class; the
        # fetch mixins in this module supply one.
        uilist = self.efetch(retmode = "text", rettype = "uilist")
        identifiers = parse.parse_fetch_identifiers(uilist)
        return Datatypes.DBIds(self.cookie.db, identifiers)
    dbids = property(
        fget = _get_dbids,
        doc = "The DBIds for this results set, fetched from the server's 'uilist'")
200
class HistoryRecord(HistoryLookup):
    """Get information about a single record in a history.

    This is a HistoryLookup whose window starts at `offset` and is
    exactly one record long.
    """
    def __init__(self, eutils, cookie, offset):
        # retstart = offset, retmax = 1
        HistoryLookup.__init__(self, eutils, cookie, offset, 1)

    def summary(self):
        """the Datatypes.Summary for this history record"""
        # The base class result is indexable; a one-record window only
        # needs its first element.
        summaries = HistoryLookup.summary(self)
        return summaries[0]
208
class SequenceHistoryFetchMixin:
    """Mixin providing efetch() through the history, with sequence options.

    The host class must supply _check_invalid(), eutils, cookie, retstart
    and retmax (HistoryLookup does).
    """
    def efetch(self, retmode = 'xml', rettype = None,
               seq_start = None, seq_stop = None, strand = None,
               complexity = None):
        """Call eutils.efetch_using_history() for this window of the history.

        All arguments are forwarded unchanged.  `strand` must be None, 1
        or 2, otherwise TypeError is raised.  The validity check of the
        history runs first, so its errors take precedence.  Returns
        whatever efetch_using_history() returns.
        """
        self._check_invalid()
        if strand not in (None, 1, 2):
            raise TypeError("Strand can only be 1 (plus, default) or 2 (minus)")
        cookie = self.cookie
        return self.eutils.efetch_using_history(
            webenv = cookie.webenv_ref[0],
            db = cookie.db,
            query_key = cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype,
            seq_start = seq_start,
            seq_stop = seq_stop,
            strand = strand,
            complexity = complexity)
228
class SequenceHistoryRecord(Mixins.SequenceFetchMixin,
                            SequenceHistoryFetchMixin,
                            HistoryRecord):
    """A single history record from a sequence database.

    Defines nothing itself; everything comes from the three base classes.
    """
233
class PublicationHistoryFetchMixin:
    """Mixin providing efetch() through the history for publications.

    The host class must supply _check_invalid(), eutils, cookie, retstart
    and retmax (HistoryLookup does).
    """
    def efetch(self, retmode = "xml", rettype = None):
        """Call eutils.efetch_using_history() for this window of the history.

        `retmode` and `rettype` are forwarded unchanged.  The validity
        check of the history runs first.  Returns whatever
        efetch_using_history() returns.
        """
        self._check_invalid()
        cookie = self.cookie
        return self.eutils.efetch_using_history(
            webenv = cookie.webenv_ref[0],
            db = cookie.db,
            query_key = cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype)
245
class PublicationHistoryRecord(Mixins.PublicationFetchMixin,
                               PublicationHistoryFetchMixin,
                               HistoryRecord):
    """A single history record from a publication database.

    Defines nothing itself; everything comes from the three base classes.
    """
250
class BaseHistoryRecordSet(HistoryLookup):
    """A window of `retmax` records, starting at `retstart`, in a history.

    Supports len(), integer indexing (returns one record) and simple
    slicing (returns a narrower record set of the same class).

    Subclasses must define the class attribute `_record_class`; it is
    called as _record_class(eutils, cookie, position) for integer
    indexing.
    """
    def __init__(self, eutils, cookie, retstart, retmax, metadata = None):
        HistoryLookup.__init__(self, eutils, cookie, retstart, retmax)
        # Stored as given; search() and post() pass the parsed server reply.
        self.metadata = metadata

    def __len__(self):
        return self.retmax

    def __getitem__(self, i):
        """Return one record (integer `i`) or a sub-window (slice `i`).

        Slices follow list semantics: negative and out-of-range bounds
        are clamped.  An empty slice yields a set with retstart 0 and
        retmax 0.  The sub-window is built without metadata.

        Raises TypeError for a slice with a step, IndexError for an
        integer outside [-len, len).
        """
        # The builtin `slice` is the same type Python 2 exposes as
        # types.SliceType, and it is still present on Python 3.
        if isinstance(i, slice):
            if i.step is not None:
                raise TypeError("cannot set step size in slice")
            # slice.indices() does the list-style clamping arithmetically,
            # so no list of every offset in the window is built.  The
            # max() keeps a (not expected) negative retmax behaving like
            # the empty window it used to produce.
            start, stop, _ = i.indices(max(self.retmax, 0))
            if stop > start:
                retstart = self.retstart + start
                retmax = stop - start
            else:
                retstart = 0
                retmax = 0
            # Don't pass metadata downwards
            return self.__class__(self.eutils, self.cookie, retstart,
                                  retmax)
        if 0 <= i < self.retmax:
            pos = self.retstart + i
        elif 1 <= -i <= self.retmax:
            # Negative index counts back from the end of the window.
            pos = self.retstart + i + self.retmax
        else:
            raise IndexError(i)
        return self._record_class(self.eutils, self.cookie, pos)
286
class SequenceHistoryRecordSet(Mixins.SequenceFetchMixin,
                               SequenceHistoryFetchMixin,
                               BaseHistoryRecordSet):
    """Record set for a sequence-database history."""
    # Class instantiated by BaseHistoryRecordSet.__getitem__ for a
    # single integer index.
    _record_class = SequenceHistoryRecord
291
class PublicationHistoryRecordSet(Mixins.PublicationFetchMixin,
                                  PublicationHistoryFetchMixin,
                                  BaseHistoryRecordSet):
    """Record set for a publication-database history."""
    # Class instantiated by BaseHistoryRecordSet.__getitem__ for a
    # single integer index.
    _record_class = PublicationHistoryRecord
296
def _get_recordset_constructor(db, dbtype):
    """Return the record-set class to use for database `db`.

    `db` and `dbtype` are handed to Config.databases.gettype(); its
    result selects the class:

      Config.SEQUENCE_TYPE    -> SequenceHistoryRecordSet
      Config.PUBLICATION_TYPE -> PublicationHistoryRecordSet

    Raises TypeError for any other resolved type.
    """
    dbtype = Config.databases.gettype(db, dbtype)
    if dbtype == Config.SEQUENCE_TYPE:
        return SequenceHistoryRecordSet
    if dbtype == Config.PUBLICATION_TYPE:
        return PublicationHistoryRecordSet
    # An unreachable trailing "return Hrec_set, Hrec" (two undefined
    # names) used to follow here; every path above already returns or
    # raises, so it has been dropped.
    raise TypeError("unknown database type: %r" % (dbtype,))
306
class HistoryClient:
    """Client that runs searches and uploads through the server history.

    Attributes:
      eutils        -- object used to talk to the server
      webenv_ref    -- one-element list holding the current WebEnv; it is
                       handed to the parse functions and to every cookie
      query_history -- record sets created so far, keyed by query_key
    """
    def __init__(self, eutils = None):
        """Use `eutils` if given, otherwise a new ThinClient.ThinClient()."""
        if eutils is not None:
            self.eutils = eutils
        else:
            self.eutils = ThinClient.ThinClient()
        self.webenv_ref = [None]
        self.query_history = {}

    def _check_for_cache_reset(self, query_key):
        """Invalidate all stored results if `query_key` is already known.

        If this is a repeat then it's because the history has expired:
        every stored record set gets its query_key set to None and the
        cache is emptied.  A new key leaves everything untouched.
        """
        if query_key in self.query_history:
            for stale in self.query_history.values():
                stale.query_key = None
            self.query_history.clear()

    def search(self,
               term,
               db = "pubmed",
               field = None,
               daterange = None,
               dbtype = None
               ):
        """Run an esearch with history enabled and return its record set.

        The record-set class is chosen from `db`/`dbtype` before the
        request is made.  A result that has a query_key is also stored
        in self.query_history.
        """
        set_klass = _get_recordset_constructor(db, dbtype)

        # retmax = 0: no identifiers are requested in the reply itself.
        infile = self.eutils.esearch(
            term = term,
            db = db,
            field = field,
            retstart = 0,
            retmax = 0,
            daterange = daterange,
            usehistory = 1,
            webenv = self.webenv_ref[0],
            )
        searchinfo = parse.parse_search(infile, self.webenv_ref)
        query_key = searchinfo.query_key

        # won't have a query_key if the search turned up empty
        if query_key is None:
            assert searchinfo.count == 0
            cookie = HistoryCookie(db, None, None)
        else:
            cookie = HistoryCookie(db, self.webenv_ref, query_key)

        recordset = set_klass(self.eutils, cookie, 0, searchinfo.count,
                              searchinfo)
        if query_key is not None:
            self._check_for_cache_reset(query_key)
            self.query_history[query_key] = recordset

        return recordset

    def post(self, dbids, dbtype = None):
        """Upload `dbids` to the history and return the resulting record set.

        The set's length is len(dbids) minus the number of identifiers
        the reply lists as invalid.  The result is stored in
        self.query_history.
        """
        set_klass = _get_recordset_constructor(dbids.db, dbtype)

        infile = self.eutils.epost(dbids,
                                   webenv = self.webenv_ref[0])
        # Extract the webenv_ref since it may change.
        postinfo = parse.parse_post(infile, self.webenv_ref)

        # Were there any invalid identifiers?
        valid_count = len(dbids) - len(postinfo.invalid_ids)

        cookie = HistoryCookie(dbids.db, self.webenv_ref,
                               postinfo.query_key)
        recordset = set_klass(self.eutils, cookie, 0, valid_count, postinfo)
        self._check_for_cache_reset(postinfo.query_key)
        self.query_history[postinfo.query_key] = recordset
        return recordset

    from_dbids = post # alias for similarity to DBIdsClient
387