1
2
3
4
5
6 """
7 This module provides code to work with Medline.
8
9 Classes:
10 Record Holds Medline data.
11 Iterator Iterates over a file containing Medline records.
12 RecordParser Parses a Medline record into a Record object.
13
14 _Scanner Scans a Medline record.
15 _RecordConsumer Consumes Medline data to a Record object.
16
17 """
18 from types import *
19
20 from Bio import File
21 from Bio.ParserSupport import *
22
24 """Holds information from a Medline record.
25
26 Members:
27 id Medline ID.
28 pubmed_id Pubmed ID.
29
30 mesh_headings List of MeSH headings.
31 mesh_tree_numbers List of MeSH Tree Numbers.
32 mesh_subheadings List of MeSH subheadings.
33
34 abstract The abstract.
35 comments List of references to comments.
36 abstract_author The author of the abstract.
37 english_abstract "A" if a foreign article has an English abstract.
38
39 source Bibliographic information.
40 publication_types List of types of publication.
41 number_of_references Number of bibliographic references, for REVIEW pubs.
42
43 authors List of authors.
44 no_author "A" for anonymous.
45 address Address of the first author.
46
47 journal_title_code Three-character code assigned to the journal.
48 title_abbreviation Abbreviation of journal title.
49 issn International Standard Serial Number.
50 journal_subsets List of strings that describe journal groupings.
51 country Country of publication.
52 languages List of languages of the article.
53
54 title Article title.
55 transliterated_title Title in the original language.
56 call_number The call number of the journal issue.
57 issue_part_supplement Issue, part, or supplement of journal published.
58 volume_issue Volume number of journal.
59 publication_date Date published (string).
60 year Year published (string).
61 pagination Inclusive pages of an indexed item.
62
63 special_list Coding for the database of the citation.
64
65 substance_name Preferred name for a chemical or drug.
66 gene_symbols List of abbreviated gene names.
67 secondary_source_ids List of source databanks and accessions.
68 identifications List of research grant or contract numbers.
69 registry_numbers List of CAS or EC numbers.
70
71 personal_name_as_subjects List of individuals who are subjects.
72
73 record_originators List of people who worked on record.
74 entry_date Date record made machine readable (YYMMDD).
75 entry_month YYMM entered into Medline.
76 class_update_date Date touched by Class Maintenance action (string).
77 last_revision_date Date for minor revision.
78 major_revision_date Date for major revision.
79
80 undefined List of lines that don't match the standard.
81
82 """
138
140 """Returns one record at a time from a file of Medline records.
141
142 Methods:
143 next Return the next record from the stream; raises StopIteration when no records remain.
144
145 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object; it must
    provide a readline method.  parser is an optional Parser object to
    change the results into another form.  If set to None, then the raw
    contents of the file will be returned.

    Raises ValueError if handle does not look like a file-like object.

    """
    # Check for the capability rather than the concrete type, so that any
    # object offering readline (not only built-in files and old-style
    # class instances) is accepted.
    if not hasattr(handle, 'readline'):
        raise ValueError("I expected a file handle or file-like object")
    # Wrap the handle so that a line read too far can be pushed back.
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
158
161
def next(self):
    """next(self) -> object

    Return the next Medline record from the file.

    A record is every line up to and including the first blank line,
    plus any further blank lines that immediately follow it.  If a
    parser was given to the constructor, the record text is handed to
    its parse_str method and that result is returned; otherwise the raw
    text is returned.

    Raises StopIteration when there is nothing left to read.

    """
    lines = []
    # Collect the record itself: stop after the first blank line, or at
    # end of file.
    while True:
        line = self._uhandle.readline()
        if not line:
            break
        lines.append(line)
        if line.rstrip() == '':
            break
    # Absorb any additional blank lines, and push the first non-blank
    # line back so that it starts the following record.
    while True:
        line = self._uhandle.readline()
        if not line:
            break
        if line.rstrip() != '':
            self._uhandle.saveline(line)
            break
        lines.append(line)

    if not lines:
        raise StopIteration

    data = ''.join(lines)
    if self._parser is not None:
        return self._parser.parse_str(data)
    return data
193
195 """Parses Medline data into a Record object.
196
197 """
201
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle and this parser's consumer to the scanner, then return
    whatever the consumer holds in its data attribute afterwards.

    """
    scanner, consumer = self._scanner, self._consumer
    scanner.feed(handle, consumer)
    return consumer.data
205
207 """Scans a Medline record.
208
209 """
210
# Maps the qualifier found at the start of a record line (e.g. "AB") to
# the name of the consumer method that receives that line.  The scanner
# looks the qualifier up here and fetches the method with getattr; a
# qualifier that is not in this table is passed to consumer.undefined.
_categories = {
    "AA" : "abstract_author",
    "AB" : "abstract",
    "AD" : "address",
    "AU" : "author",
    "CA" : "call_number",
    "CM" : "comments",
    "CU" : "class_update_date",
    "CY" : "country",
    "DA" : "entry_date",
    "DP" : "publication_date",
    "EA" : "english_abstract",
    "EM" : "entry_month",
    "GS" : "gene_symbol",
    "ID" : "identification",
    "IP" : "issue_part_supplement",
    "IS" : "issn",
    "JC" : "journal_title_code",
    "LA" : "language",
    "LI" : "special_list",
    "LR" : "last_revision_date",
    "MH" : "mesh_heading",
    "MN" : "mesh_tree_number",
    "MR" : "major_revision_date",
    "NI" : "no_author",
    "NM" : "substance_name",
    "PG" : "pagination",
    "PS" : "personal_name_as_subject",
    "PT" : "publication_type",
    "RF" : "number_of_references",
    "RN" : "cas_registry_number",
    "RO" : "record_originator",
    "SB" : "journal_subset",
    "SH" : "subheadings",
    "SI" : "secondary_source_id",
    "SO" : "source",
    "TA" : "title_abbreviation",
    "TI" : "title",
    "TT" : "transliterated_title",
    "UI" : "unique_identifier",
    "VI" : "volume_issue",
    "YR" : "year",


    "PMID" : "pubmed_id",
    }
257
258 - def feed(self, handle, consumer):
275
277 consumer.start_record()
278
279 prev_qualifier = None
280 while 1:
281 line = uhandle.readline()
282 if is_blank_line(line):
283 break
284
285
286
287
288
289
290
291
292 qualifier = string.rstrip(line[:4])
293
294
295
296
297 if line[0] == '\t' or qualifier == '' or \
298 line[:13] == ' purification':
299 if prev_qualifier is None:
300 raise SyntaxError, "Continuation on first line\n%s" % line
301 qualifier = prev_qualifier
302 else:
303
304 if len(line) < 5 or line[4] != '-':
305 raise SyntaxError, \
306 "I don't understand the format of line %s" % line
307 prev_qualifier = qualifier
308
309 try:
310 fn = getattr(consumer, self._categories[qualifier])
311 except KeyError:
312
313 consumer.undefined(line)
314 else:
315 fn(line)
316
317 consumer.end_record()
318
320 """Consumer that converts a Medline record to a Record object.
321
322 Members:
323 data Record with Medline data.
324
325 """
328
331
334
337
340
343
346
350
353
358
362
def entry_date(self, line):
    """entry_date(self, line)

    Store the cleaned contents of line as the record's entry date.

    Raises AssertionError if an entry date has already been stored for
    this record.

    """
    # Raise explicitly instead of using an assert statement, so the
    # duplicate check still runs when Python is started with -O.
    if self.data.entry_date:
        raise AssertionError("entry date already defined")
    self.data.entry_date = self._clean(line)
366
371
376
def entry_month(self, line):
    """entry_month(self, line)

    Store the cleaned contents of line as the record's entry month.

    Raises AssertionError if an entry month has already been stored for
    this record.

    """
    # Raise explicitly instead of using an assert statement, so the
    # duplicate check still runs when Python is started with -O.
    if self.data.entry_month:
        raise AssertionError("entry month already defined")
    self.data.entry_month = self._clean(line)
381
384
387
392
393 - def issn(self, line):
396
401
404
408
413
415
416
417
418
419 if line[:2] == 'MH':
420 self.data.mesh_headings.append(self._clean(line))
421 else:
422 prev_mh = self.data.mesh_headings.pop()
423 continued_mh = self._clean(line)
424 self.data.mesh_headings.append("%s %s" % (prev_mh, continued_mh))
425
428
433
437
441
443 assert not self.data.pagination, "pagination already defined"
444 self.data.pagination = self._clean(line)
445
448
451
456
459
462
465
468
471
474
478
481
485
489
493
494 - def year(self, line):
497
501
511
def _clean(self, line, rstrip=1):
    """_clean(self, line, rstrip=1) -> string

    Return the data portion of a record line.

    If the line contains a tab, everything after the first tab is the
    data.  Otherwise, a line starting with ' purification' only loses
    its first character, and any other line loses its first six
    characters (presumably the qualifier column, e.g. "AB  - ").

    If rstrip is true, trailing whitespace is removed from the result.

    """
    # Use str methods rather than the deprecated string-module functions.
    tab = line.find('\t')
    if tab >= 0:
        nospace = line[tab+1:]
    elif line[:13] == ' purification':
        nospace = line[1:]
    else:
        nospace = line[6:]
    if rstrip:
        return nospace.rstrip()
    return nospace
523
524 _needs_stripping = [
525 'abstract', 'source', 'address', 'title_abbreviation',
526 'title', 'transliterated_title'
527 ]
533