Package Bio :: Package Medline :: Module NLMMedlineXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Medline.NLMMedlineXML

  1  # Copyright 2001 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with the NCBI's XML format for Medline. 
  8   
  9  Functions: 
 10  choose_format   Pick the right data format to use to index an XML file. 
 11  index           Index a Medline XML file. 
 12  index_many      Index multiple Medline XML files. 
 13   
 14  """ 
 15   
 16   
 17  import warnings 
 18  warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 19   
 20   
 21  # To Do: 
 22  # - Implement CitationParser 
 23  import os 
 24  import types 
 25  from xml.sax import handler 
 26   
 27  from Bio.ParserSupport import * 
 28  from Bio import MultiProc 
 29   
 30  import Martel 
 31   
def choose_format(data):
    """choose_format(data) -> module

    Look at some data and choose the right format to parse it.

    data should be the first 1000 characters or so of the file.  It is
    searched for each known format identifier in turn, and the first
    identifier found selects the format module.

    Returns the matching Bio.Medline format module.  The module will
    contain 2 attributes: citation_format and format.  citation_format
    is a Martel format to parse one citation.  format will parse the
    whole file.

    Raises AssertionError if none of the known identifiers occurs in
    data.

    """
    formats = [
        ("nlmmedline_001211", "nlmmedline_001211_format"),
        ("nlmmedline_010319", "nlmmedline_010319_format"),
        ("nlmmedline_011101", "nlmmedline_011101_format"),
        ("nlmmedline_031101", "nlmmedline_031101_format"),
        ]
    for identifier, format_module in formats:
        if identifier in data:
            break
    else:
        # Call-style raise is valid on both Python 2 and Python 3; the
        # exception type is unchanged for existing callers.
        raise AssertionError("I could not identify that format.")
    package = '.'.join(["Bio", "Medline", format_module])
    # A non-empty fromlist makes __import__ return the leaf module
    # instead of the top-level "Bio" package.
    return __import__(package, {}, {}, ["*"])
55
class Citation:
    """Holds information about a Medline citation.

    This class defines no behavior of its own; it is a plain container
    for the members listed below.

    Members:
    medline_id      Medline ID.
    pmid            Pubmed ID.

    date_created    Tuple of (year, month, day, season, medline date).
    date_completed  Tuple of (year, month, day, season, medline date).
    date_revised    Tuple of (year, month, day, season, medline date).

    abstract        Tuple of (text, copyright info).
    journal         Tuple of (ISSN, volume, issue, date).
    article_title   Title of article.
    pagination      Tuple of (start, end, medline pagination).
    accession_numbers  List of accession numbers.
    affiliation     Affiliation.
    author_list     List of authors.
    languages       List of languages.
    databank_list   List of tuples (name, accession numbers).
    grant_list      List of tuples (grant id, acronym, agency).
    publication_type_list   List of publication types.
    vernacular_title   Vernacular title.

    medline_journal_info  Tuple of (country, medline ta, medline code, nlm id).
    chemical_list   List of (CAS registry number, name).
    citation_subsets  List of citation subsets.
    comments_corrections  XXX not implemented
    gene_symbol_list  List of gene symbols.
    mesh_heading_list  List of (descriptor, subheadings).
    number_of_references  Number of references (int).
    personal_name_subject_list  List of personal names.

    """
92
class CitationParser(AbstractParser):
    """Parses a citation into a Record object.

    Not implemented: constructing an instance always raises
    NotImplementedError.

    """
    def __init__(self):
        # Placeholder until the parser is actually written.
        raise NotImplementedError()
99
class _IndexerHandler(handler.ContentHandler):
    """Handles the results from the nlmmedline_format.  Saves the begin
    and end of each record as an offset from the beginning of the parse.

    Offsets are computed by summing the lengths of the text passed to
    characters(); element tags themselves add nothing to the count.

    """
    def __init__(self, found_citation_fn):
        """_IndexerHandler(found_citation_fn)

        found_citation_fn is called with the PMID, MedlineID, start,
        end where start and end are offsets from the beginning of the
        parse, with slice semantics.

        """
        handler.ContentHandler.__init__(self)
        self._citation_fn = found_citation_fn
        self._elements = []           # Open element tags.
        self._offset = 0              # Current file offset.
        self._start = None            # Offset of the start of the record.
        self._pmid = ''
        self._medline_id = ''

    def _directly_under_citation(self):
        """Return true if the innermost open element's parent is MedlineCitation."""
        return (len(self._elements) >= 2 and
                self._elements[-2] == "MedlineCitation")

    def startElement(self, name, attrs):
        """Record the opened element and note where a citation begins.

        Raises SyntaxError if a MedlineCitation opens while another
        one is still open.

        """
        self._elements.append(name)
        if name == 'MedlineCitation':
            if self._start is not None:
                raise SyntaxError(
                    "Found MedlineCitation, but already in one.")
            self._start = self._offset
        elif self._directly_under_citation():
            # Start each ID element from an empty value so that the text
            # chunks delivered to characters() can simply be appended.
            if name == "PMID":
                self._pmid = ''
            elif name == "MedlineID":
                self._medline_id = ''

    def endElement(self, name):
        """Close an element; report the record when a citation ends.

        Raises SyntaxError if name does not match the innermost open
        element, or if a MedlineCitation closes without both a PMID
        and a MedlineID having been seen.

        """
        if not self._elements or self._elements[-1] != name:
            raise SyntaxError("Elements not nested: %s" % name)
        self._elements.pop()
        if name == 'MedlineCitation':
            if not self._pmid or not self._medline_id:  # didn't find an ID:
                raise SyntaxError("I couldn't find an id: %s %s" % (
                    self._pmid, self._medline_id))
            self._citation_fn(
                self._pmid, self._medline_id, self._start, self._offset)
            # Reset the per-record state for the next citation.
            self._start = None
            self._pmid = self._medline_id = ''

    def characters(self, content):
        """Advance the offset and collect ID text.

        The SAX interface allows one run of text to be delivered in
        several calls, so ID text is accumulated rather than assigned.

        """
        self._offset += len(content)
        # Examine the tags directly under <MedlineCitation>.
        if self._directly_under_citation():
            if self._elements[-1] == "PMID":
                self._pmid += content
            elif self._elements[-1] == "MedlineID":
                self._medline_id += content
145
class _SavedDataHandle:
    """Wraps a handle so that data already read from it is replayed first.

    read() returns the saved data before anything further is read from
    the underlying handle.

    """
    def __init__(self, handle, saved):
        """_SavedDataHandle(handle, saved)

        handle is the underlying object with a read method.  saved is
        the data that has already been consumed from handle and must be
        returned again first.

        """
        self.saved = saved
        self.handle = handle

    def read(self, length=None):
        """read([length]) -> data

        Return up to length characters, drawing on the saved data before
        the underlying handle.  If length is None or negative, return
        everything that is left.

        """
        # A negative length is the file-object convention for "read to
        # the end"; slicing the saved data with it would drop and
        # reorder characters, so treat it the same as None.
        if length is None or length < 0:
            data = self.saved + self.handle.read()
            self.saved = ''
        else:
            data = self.saved[:length]
            # Only ask the handle for what the saved data did not cover.
            data += self.handle.read(length - len(data))
            self.saved = self.saved[length:]
        return data
159
def index(handle, index_fn=None):
    """index(handle[, index_fn]) -> list of (PMID, MedlineID, start, end)

    Index a Medline XML file.  Returns where the records are, as
    offsets from the beginning of the handle.  index_fn is a callback
    function with parameters (PMID, MedlineID, start, end) and is
    called as soon as each record is indexed.

    """
    # Peek at the start of the data to pick the format, then wrap the
    # handle so the parser still sees the characters consumed here.
    head = handle.read(1000)
    format_module = choose_format(head)
    replay_handle = _SavedDataHandle(handle, head)
    # Restrict the format to the tags the indexer looks at.
    selected_format = Martel.select_names(
        format_module.format, ["MedlineCitation", "PMID", "MedlineID"])

    # Collect every record found, notifying index_fn first if given.
    indexes = []

    def record_citation(pmid, medline_id, start, end):
        if index_fn is not None:
            index_fn(pmid, medline_id, start, end)
        indexes.append((pmid, medline_id, start, end))

    indexer = _IndexerHandler(record_citation)

    # Create the parser and parse the results.
    parser = selected_format.make_parser(debug_level=0)
    parser.setContentHandler(indexer)
    parser.setErrorHandler(handler.ErrorHandler())
    parser.parseFile(replay_handle)
    return indexes
def index_many(files_or_paths, index_fn, nprocs=1):
    """index_many(files_or_paths, index_fn[, nprocs])

    Index multiple Medline XML files.  files_or_paths can be a single
    file, a path, a list of files, or a list of paths.  Each entry that
    is a directory contributes every name listed in that directory.

    index_fn is a callback function that should take the following
    parameters:
    index_fn(file, event, data)

    where file is the file being indexed, event is one of "START",
    "RECORD", "END", and data is extra data dependent upon the event.
    "START" and "END" events are passed to indicate when a file is
    being indexed.  "RECORD" is passed whenever a new record has been
    indexed.  When a "RECORD" event is passed, then data is set to a
    tuple of (pmid, medline_id, start, end).  Otherwise it is None.
    start and end indicate the location of the record as offsets from
    the beginning of the file.

    nprocs is handed to MultiProc.run together with the worker
    function.

    Raises ValueError if an entry is neither an existing file nor an
    existing directory.

    """
    # This isn't a very good solution because it only allows 2 types
    # of sequences (and their subclasses); anything else, including a
    # string, is treated as a single entry.
    if not isinstance(files_or_paths, (list, tuple)):
        files_or_paths = [files_or_paths]

    files = []
    for f in files_or_paths:
        if os.path.isfile(f):
            files.append(f)
        elif os.path.isdir(f):
            for name in os.listdir(f):
                files.append(os.path.join(f, name))
        else:
            raise ValueError("I can't find %s" % f)

    def do_some(start, skip, files, index_fn):
        # Handles every skip-th file beginning at position start.
        # NOTE(review): presumably MultiProc.run supplies start and skip
        # so that each process gets a disjoint share -- confirm.
        for i in range(start, len(files), skip):
            infile = files[i]
            index_fn(infile, "START", None)
            # index takes an optional index_fn with a different
            # interface than the callback for this function.  Thus, I
            # have to make an adapter to change the interface to one
            # that my client expects.
            def index_fn_adapter(pmid, medline_id, start, end,
                                 infile=infile, index_fn=index_fn):
                index_fn(infile, "RECORD", (pmid, medline_id, start, end))
            # Close the file even if indexing fails, so handles do not
            # pile up while working through many files.
            infile_handle = open(infile)
            try:
                index(infile_handle, index_fn_adapter)
            finally:
                infile_handle.close()
            index_fn(infile, "END", None)
    MultiProc.run(nprocs, do_some, fn_args=(files, index_fn))