Package Bio :: Package Medline
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Medline

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with Medline. 
  8   
  9  Classes: 
 10  Record           Holds Medline data. 
 11  Iterator         Iterates over a file containing Medline records. 
 12  RecordParser     Parses a Medline record into a Record object. 
 13   
 14  _Scanner         Scans a Medline record. 
 15  _RecordConsumer  Consumes Medline data to a Record object. 
 16   
 17  """ 
 18  from types import * 
 19   
 20  from Bio import File 
 21  from Bio.ParserSupport import * 
 22   
class Record:
    """Holds information from a Medline record.

    Every member starts out empty: string members as '' and list
    members as a new empty list.

    Members:
    id                         Medline ID.
    pubmed_id                  Pubmed ID.

    mesh_headings              List of MeSH headings.
    mesh_tree_numbers          List of MeSH Tree Numbers.
    mesh_subheadings           List of MeSH subheadings.

    abstract                   The abstract.
    comments                   List of references to comments.
    abstract_author            The author of the abstract.
    english_abstract           "A" if a foreign article has an english abstract.

    source                     Bibliographic information.
    publication_types          List of type of publication.
    number_of_references       Number of bibliographic references, for REVIEW pubs.

    authors                    List of authors.
    no_author                  "A" for anonymous.
    address                    Address of the first author.

    journal_title_code         Three-character code assigned to the journal.
    title_abbreviation         Abbreviation of journal title.
    issn                       International Standard Serial Number.
    journal_subsets            List of strings that describe journal groupings.
    country                    Country of publication
    languages                  List of languages of the article.

    title                      Article title.
    transliterated_title       Title in the original language.
    call_number                The call number of the journal issue.
    issue_part_supplement      Issue, part, or supplement of journal published.
    volume_issue               Volume number of journal.
    publication_date           Date published (string).
    year                       Year published (string).
    pagination                 Inclusive pages of an indexed item.

    special_list               Coding for the database of the citation.

    substance_name             Preferred name for a chemical or drug.
    gene_symbols               List of abbreviated gene names.
    secondary_source_ids       List of source databanks and accessions.
    identifications            List of research grant or contract numbers.
    registry_numbers           List of CAS or EC numbers.

    personal_name_as_subjects  List of individuals who are subjects.

    record_originators         List of people who worked on record.
    entry_date                 Date record made machine readable (YYMMDD).
    entry_month                YYMM entered into Medline.
    class_update_date          Date touched by Class Maintenance action (string).
    last_revision_date         Date for minor revision.
    major_revision_date        Date for major revision.

    undefined                  List of lines that don't match the standard.

    """

    def __init__(self):
        # Members that hold a single string value.
        text_members = (
            'id', 'pubmed_id',
            'abstract', 'abstract_author', 'english_abstract',
            'source', 'number_of_references',
            'no_author', 'address',
            'journal_title_code', 'title_abbreviation', 'issn', 'country',
            'title', 'transliterated_title', 'call_number',
            'issue_part_supplement', 'volume_issue', 'publication_date',
            'year', 'pagination',
            'special_list',
            'substance_name',
            'entry_date', 'entry_month', 'class_update_date',
            'last_revision_date', 'major_revision_date',
        )
        # Members that accumulate several values.
        list_members = (
            'mesh_headings', 'mesh_tree_numbers', 'mesh_subheadings',
            'comments',
            'publication_types',
            'authors',
            'journal_subsets', 'languages',
            'gene_symbols', 'secondary_source_ids', 'identifications',
            'registry_numbers',
            'personal_name_as_subjects',
            'record_originators',
            'undefined',
        )
        for member in text_members:
            setattr(self, member, '')
        for member in list_members:
            # The list literal is evaluated on every pass, so each member
            # gets its own independent list.
            setattr(self, member, [])
138
class Iterator:
    """Returns one record at a time from a file of Medline records.

    Methods:
    next   Return the next record from the stream; raises StopIteration
           when the stream is exhausted.

    The object is its own iterator, so it can be used directly in a
    for loop.

    """

    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises ValueError if handle does not look like a file, i.e. has
        no readline method.

        """
        # Duck-type the handle instead of comparing against FileType /
        # InstanceType: those exact-type checks turn away legitimate
        # file-like objects (new-style class instances, cStringIO, ...).
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def __iter__(self):
        return self

    def next(self):
        """next(self) -> object

        Return the next medline record from the file.  A record is
        everything up to and including the blank line(s) that follow it.
        If a parser was supplied, the result of its parse_str method on
        the record text is returned; otherwise the raw text is returned.

        Raises StopIteration if there are no more records.

        """
        uhandle = self._uhandle
        lines = []

        # Collect the record itself, up to and including the first
        # blank line (or the end of the file).
        while True:
            line = uhandle.readline()
            if not line:
                break
            lines.append(line)
            if not line.rstrip():
                break

        # Absorb any further blank separator lines.  The first non-blank
        # line belongs to the next record, so it is pushed back.
        while True:
            line = uhandle.readline()
            if not line:
                break
            if line.rstrip():
                uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            raise StopIteration

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse_str(data)
        return data

    # Python 3 spelling of the iterator protocol.
    __next__ = next
193
class RecordParser(AbstractParser):
    """Parses Medline data into a Record object.

    Pairs a _Scanner with a _RecordConsumer: the scanner reads the
    handle and the consumer collects what it reports.

    """

    def __init__(self):
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """parse(self, handle) -> object

        Feed handle through the scanner into the consumer and return the
        consumer's data member afterwards.

        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
205
class _Scanner:
    """Scans a Medline record.

    Each line read is reported to a consumer by calling the consumer
    method whose name is registered for the line's qualifier.

    """
    # Map the category qualifier to the name of the consumer event.
    _categories = {
        "AA": "abstract_author",
        "AB": "abstract",
        "AD": "address",
        "AU": "author",
        "CA": "call_number",
        "CM": "comments",
        "CU": "class_update_date",
        "CY": "country",
        "DA": "entry_date",
        "DP": "publication_date",
        "EA": "english_abstract",
        "EM": "entry_month",
        "GS": "gene_symbol",
        "ID": "identification",
        "IP": "issue_part_supplement",
        "IS": "issn",
        "JC": "journal_title_code",
        "LA": "language",
        "LI": "special_list",
        "LR": "last_revision_date",
        "MH": "mesh_heading",
        "MN": "mesh_tree_number",
        "MR": "major_revision_date",
        "NI": "no_author",
        "NM": "substance_name",
        "PG": "pagination",
        "PS": "personal_name_as_subject",
        "PT": "publication_type",
        "RF": "number_of_references",
        "RN": "cas_registry_number",
        "RO": "record_originator",
        "SB": "journal_subset",
        "SH": "subheadings",
        "SI": "secondary_source_id",
        "SO": "source",
        "TA": "title_abbreviation",
        "TI": "title",
        "TT": "transliterated_title",
        "UI": "unique_identifier",
        "VI": "volume_issue",
        "YR": "year",

        # Not documented.
        "PMID": "pubmed_id",
    }

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in a Medline unit record for scanning.  handle is a file-like
        object that contains a Medline record.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # Wrap the handle unless the caller already did so.
        uhandle = handle
        if not isinstance(uhandle, File.UndoHandle):
            uhandle = File.UndoHandle(uhandle)

        # Read the Entrez header information, if it exists: a line that
        # starts with 'Entrez', followed by a line of dashes.
        if attempt_read_and_call(uhandle, consumer.noevent, start='Entrez'):
            read_and_call(uhandle, consumer.noevent, start='----------------')
        self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """_scan_record(self, uhandle, consumer)

        Report the lines of one record to consumer, bracketed by its
        start_record and end_record events.  Scanning stops at the first
        line that is_blank_line accepts.

        Raises SyntaxError if the record opens with a continuation line,
        or if a line that is not a continuation has no '-' in its fifth
        column.

        """
        consumer.start_record()

        last_qualifier = None  # qualifier of the latest qualified line
        while True:
            line = uhandle.readline()
            if is_blank_line(line):
                break

            # There are 2 possible formats for a line:
            # TI  - Epidemiology of mycobacterial resistance (especially
            #       Mycobacterium tuberculosis).
            # 1) qualifier + '-' + data
            # 2) continuation, with just data
            #
            # A continuation starts with a tab or has a blank qualifier
            # column.  Some MH lines are also malformed: the "isolation &
            # purification" subheading gets split across lines, which
            # leaves "purification" at the beginning of the next line
            # behind only 1 space.  That counts as a continuation too.
            continuation = (
                line[0] == '\t'
                or line[:4].rstrip() == ''
                or line[:13] == ' purification'
            )
            if continuation:
                if last_qualifier is None:
                    raise SyntaxError("Continuation on first line\n%s" % line)
                qualifier = last_qualifier
            else:
                # Make sure it contains a '-'
                if len(line) < 5 or line[4] != '-':
                    raise SyntaxError(
                        "I don't understand the format of line %s" % line)
                qualifier = line[:4].rstrip()
                last_qualifier = qualifier

            try:
                handler = getattr(consumer, self._categories[qualifier])
            except KeyError:
                # Qualifier is not in _categories: hand the whole line to
                # the consumer's 'undefined' event instead.
                consumer.undefined(line)
            else:
                handler(line)

        consumer.end_record()
318
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Medline record to a Record object.

    Members:
    data    Record with Medline data.

    Each event method receives one raw line of the record and stores its
    cleaned-up data part on the matching member of data.

    """

    def __init__(self):
        self.data = None

    # -- record boundaries --------------------------------------------

    def start_record(self):
        self.data = Record()

    def end_record(self):
        self._clean_record(self.data)

    # -- storage helpers ----------------------------------------------

    def _assign_once(self, member, line, message):
        # Store a single-valued member; assert that it was still empty.
        assert not getattr(self.data, member), message
        setattr(self.data, member, self._clean(line))

    def _append_clean(self, member, line):
        # Add the cleaned data of the line to a list member.
        getattr(self.data, member).append(self._clean(line))

    def _concat_raw(self, member, line):
        # Extend a multi-line text member.  The line ending is kept so
        # that continuation lines stay separated; _clean_record strips
        # the trailing whitespace once the record is complete.
        previous = getattr(self.data, member)
        setattr(self.data, member, previous + self._clean(line, rstrip=0))

    # -- events, one per qualifier ------------------------------------

    def abstract_author(self, line):
        self.data.abstract_author = self._clean(line)

    def abstract(self, line):
        self._concat_raw('abstract', line)

    def address(self, line):
        self._concat_raw('address', line)

    def author(self, line):
        self._append_clean('authors', line)

    def call_number(self, line):
        self._assign_once('call_number', line,
                          "call numbers already defined")

    def comments(self, line):
        self._append_clean('comments', line)

    def class_update_date(self, line):
        self._assign_once('class_update_date', line,
                          "class update date already defined")

    def country(self, line):
        self._assign_once('country', line, "country already defined")

    def entry_date(self, line):
        self._assign_once('entry_date', line, "entry date already defined")

    def publication_date(self, line):
        self._assign_once('publication_date', line,
                          "publication date already defined")

    def english_abstract(self, line):
        self._assign_once('english_abstract', line,
                          "english abstract already defined")

    def entry_month(self, line):
        self._assign_once('entry_month', line,
                          "entry month already defined")

    def gene_symbol(self, line):
        self._append_clean('gene_symbols', line)

    def identification(self, line):
        self._append_clean('identifications', line)

    def issue_part_supplement(self, line):
        self._assign_once('issue_part_supplement', line,
                          "issue/part/supplement already defined")

    def issn(self, line):
        self._assign_once('issn', line, "ISSN already defined")

    def journal_title_code(self, line):
        self._assign_once('journal_title_code', line,
                          "journal title code already defined")

    def language(self, line):
        self._append_clean('languages', line)

    def special_list(self, line):
        self._assign_once('special_list', line,
                          "special list already defined")

    def last_revision_date(self, line):
        self._assign_once('last_revision_date', line,
                          "last revision date already defined")

    def mesh_heading(self, line):
        # A line that starts with 'MH' opens a new heading.  Anything
        # else is the continuation of the previous heading and is joined
        # to it with a single space.
        # See PMID 12107064 for an example, found by Dan Rubin.
        headings = self.data.mesh_headings
        text = self._clean(line)
        if line[:2] != 'MH':
            text = "%s %s" % (headings.pop(), text)
        headings.append(text)

    def mesh_tree_number(self, line):
        self._append_clean('mesh_tree_numbers', line)

    def major_revision_date(self, line):
        self._assign_once('major_revision_date', line,
                          "major revision date already defined")

    def no_author(self, line):
        self._assign_once('no_author', line, "no author already defined")

    def substance_name(self, line):
        self._assign_once('substance_name', line,
                          "substance name already defined")

    def pagination(self, line):
        self._assign_once('pagination', line, "pagination already defined")

    def personal_name_as_subject(self, line):
        self._append_clean('personal_name_as_subjects', line)

    def publication_type(self, line):
        self._append_clean('publication_types', line)

    def number_of_references(self, line):
        self._assign_once('number_of_references', line,
                          "num of references already defined")

    def cas_registry_number(self, line):
        self._append_clean('registry_numbers', line)

    def record_originator(self, line):
        self._append_clean('record_originators', line)

    def journal_subset(self, line):
        self._append_clean('journal_subsets', line)

    def subheadings(self, line):
        self._append_clean('mesh_subheadings', line)

    def secondary_source_id(self, line):
        self._append_clean('secondary_source_ids', line)

    def source(self, line):
        self._concat_raw('source', line)

    def title_abbreviation(self, line):
        self._concat_raw('title_abbreviation', line)

    def title(self, line):
        self._concat_raw('title', line)

    def transliterated_title(self, line):
        self._concat_raw('transliterated_title', line)

    def unique_identifier(self, line):
        self._assign_once('id', line, "id already defined")

    def volume_issue(self, line):
        self._assign_once('volume_issue', line,
                          "volume issue already defined")

    def year(self, line):
        self._assign_once('year', line, "year already defined")

    def pubmed_id(self, line):
        self._assign_once('pubmed_id', line, "PMID already defined")

    def undefined(self, line):
        # Records sometimes contain lines with qualifiers that don't match
        # any in the standard.  All these lines go into another variable,
        # stored as the complete, uncleaned line.
        # Some undefined qualifiers:
        # 4098, 4099, 4100, 4101
        # 634
        # NP, PID, EDAT, MHDA
        self.data.undefined.append(line)

    # -- line and record clean-up -------------------------------------

    def _clean(self, line, rstrip=1):
        """_clean(self, line, rstrip=1) -> string

        Return the data part of line.  If the line contains a tab, the
        data is whatever follows the first tab.  A line starting with
        ' purification' only loses its leading space.  Any other line
        loses its first 6 characters.  Trailing whitespace is stripped
        unless rstrip is false.

        """
        tab = line.find('\t')
        if tab >= 0:
            payload = line[tab + 1:]
        elif line.startswith(' purification'):
            payload = line[1:]
        else:
            payload = line[6:]
        if not rstrip:
            return payload
        return payload.rstrip()

    # Text members built up with rstrip=0; they still carry trailing
    # whitespace when the record ends.
    _needs_stripping = [
        'abstract', 'source', 'address', 'title_abbreviation',
        'title', 'transliterated_title'
    ]

    def _clean_record(self, rec):
        # Remove trailing newlines
        for member in self._needs_stripping:
            setattr(rec, member, getattr(rec, member).rstrip())
533