1
2
3
4
5
6 """
7 This module provides code to work with the NCBI's XML format for Medline.
8
9 Functions:
10 choose_format Pick the right data format to use to index an XML file.
11 index Index a Medline XML file.
12 index_many Index multiple Medline XML files.
13
14 """
15
16
17 import warnings
18 warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
19
20
21
22
23 import os
24 import types
25 from xml.sax import handler
26
27 from Bio.ParserSupport import *
28 from Bio import MultiProc
29
30 import Martel
31
55
57 """Holds information about a Medline citation.
58
59 Members:
60 medline_id Medline ID.
61 pmid Pubmed ID.
62
63 date_created Tuple of (year, month, day, season, medline date).
64 date_completed Tuple of (year, month, day, season, medline date).
65 date_revised Tuple of (year, month, day, season, medline date).
66
67 abstract Tuple of (text, copyright info).
68 journal Tuple of (ISSN, volume, issue, date).
69 article_title Title of article.
70 pagination Tuple of (start, end, medline pagination).
71 accession_numbers List of accession numbers.
72 affiliation Affiliation.
73 author_list List of authors.
74 languages List of languages
75 databank_list List of tuples (name, accession numbers).
76 grant_list List of tuples (grant id, acronym, agency)
77 publication_type_list List of publication types.
78 vernacular_title Vernacular title.
79
80
81 medline_journal_info Tuple of (country, medline ta, medline code, nlm id)
82 chemical_list List of (CAS registry number, name).
83 citation_subsets List of citation subsets.
84 comments_corrections XXX not implemented
85 gene_symbol_list List of gene symbols.
86 mesh_heading_list List of (descriptor, subheadings).
87 number_of_references Number of references (int).
88 personal_name_subject_list List of personal names.
89
90 """
91 pass
92
94 """Parses a citation into a Record object.
95
96 """
98 raise NotImplementedError
99
101 """Handles the results from the nlmmedline_format. Saves the begin
102 and end of each record as an offset from the beginning of the parse.
103
104 """
106 """_IndexerHandler(found_citation_fn)
107
108 found_citation_fn is called with the PMID, MedlineID, start,
109 end where start and end are offsets from the beginning of the
110 parse, with slice semantics.
111
112 """
113 self._citation_fn = found_citation_fn
114 self._elements = []
115 self._offset = 0
116 self._start = None
117 self._pmid = ''
118 self._medline_id = ''
120 self._elements.append(name)
121 if name == 'MedlineCitation':
122 if self._start is not None:
123 raise SyntaxError, "Found MedlineCitation, but already in one."
124 self._start = self._offset
126 if not self._elements or self._elements[-1] != name:
127 raise SyntaxError, "Elements not nested: %s" % name
128 self._elements.pop()
129 if name == 'MedlineCitation':
130 if not self._pmid or not self._medline_id:
131 raise SyntaxError, "I couldn't find an id: %s %s" % (
132 self._pmid, self._medline_id)
133 self._citation_fn(
134 self._pmid, self._medline_id, self._start, self._offset)
135 self._start = None
136 self._pmid = self._medline_id = ''
138 self._offset += len(content)
139
140 if len(self._elements)>=2 and self._elements[-2] == "MedlineCitation":
141 if self._elements[-1] == "PMID":
142 self._pmid = content
143 elif self._elements[-1] == "MedlineID":
144 self._medline_id = content
145
150 - def read(self, length=None):
159
def index(handle, index_fn=None):
    """index(handle[, index_fn]) -> list of (PMID, MedlineID, start, end)

    Locate the citations in a Medline XML file.

    handle is a file-like object positioned at the start of the data.
    The return value is a list with one (PMID, MedlineID, start, end)
    tuple per citation, in the order the parser reports them.  start
    and end are the offsets reported by the indexing handler.

    index_fn, when given, is called with the same four values
    (PMID, MedlineID, start, end) as soon as each record is indexed,
    before the tuple is added to the returned list.

    """
    # Peek at the first 1000 characters to decide which format module
    # applies, then wrap the handle together with the data already read
    # (presumably so the parser still sees the file from the beginning;
    # confirm against _SavedDataHandle).
    head = handle.read(1000)
    format_module = choose_format(head)
    handle = _SavedDataHandle(handle, head)

    # Only these three element names are inspected by the indexing
    # handler, so ask Martel for a format limited to them.
    wanted = ["MedlineCitation", "PMID", "MedlineID"]
    expression = Martel.select_names(format_module.format, wanted)

    found = []

    def record_citation(pmid, medline_id, start, end):
        # Notify the caller first, then remember the record.
        if index_fn is not None:
            index_fn(pmid, medline_id, start, end)
        found.append((pmid, medline_id, start, end))

    indexer = _IndexerHandler(record_citation)

    parser = expression.make_parser(debug_level=0)
    parser.setContentHandler(indexer)
    parser.setErrorHandler(handler.ErrorHandler())
    parser.parseFile(handle)
    return found
193
def index_many(files_or_paths, index_fn, nprocs=1):
    """index_many(files_or_paths, index_fn[, nprocs])

    Index multiple Medline XML files.  files_or_paths can be a single
    file name, a single directory name, or a list or tuple of file
    and/or directory names.  A directory contributes every entry
    returned by os.listdir for it (one level only; the entries are not
    checked to be regular files).

    index_fn is a callback function that should take the following
    parameters:
    index_fn(file, event, data)

    where file is the name of the file being indexed, event is one of
    "START", "RECORD", "END", and data is extra data dependent upon the
    event.  "START" and "END" events are passed to indicate when a file
    is being indexed.  "RECORD" is passed whenever a new record has been
    indexed.  When a "RECORD" event is passed, then data is set to a
    tuple of (pmid, medline_id, start, end).  Otherwise it is None.
    start and end indicate the location of the record as offsets from
    the beginning of the file.

    nprocs is handed to MultiProc.run together with the worker
    function (presumably the number of worker processes; confirm
    against Bio.MultiProc).

    Raises ValueError if an entry of files_or_paths is neither an
    existing file nor an existing directory.

    """
    # Accept a bare name as well as a sequence of names.  isinstance
    # also admits list/tuple subclasses, which the old exact-type
    # comparison wrongly wrapped in another list.
    if not isinstance(files_or_paths, (list, tuple)):
        files_or_paths = [files_or_paths]

    # Expand directories into the names they contain.
    files = []
    for f in files_or_paths:
        if os.path.isfile(f):
            files.append(f)
        elif os.path.isdir(f):
            files.extend([os.path.join(f, name) for name in os.listdir(f)])
        else:
            raise ValueError("I can't find %s" % f)

    def do_some(start, skip, files, index_fn):
        # Handle every skip-th file beginning at position start, so
        # that several workers can share the list without overlap.
        for i in range(start, len(files), skip):
            infile = files[i]
            index_fn(infile, "START", None)

            # Adapt the 4-argument callback expected by index() to the
            # (file, event, data) callback supplied by the caller.  The
            # defaults bind the current file name and callback.
            def index_fn_adapter(pmid, medline_id, start, end,
                                 infile=infile, index_fn=index_fn):
                index_fn(infile, "RECORD", (pmid, medline_id, start, end))

            # Close the file even if indexing fails; the handle used to
            # be left open for the garbage collector to reclaim.
            handle = open(infile)
            try:
                index(handle, index_fn_adapter)
            finally:
                handle.close()
            index_fn(infile, "END", None)

    MultiProc.run(nprocs, do_some, fn_args=(files, index_fn))
245