1
2
3
4
5
6 """
7 This module provides code to work with Medline.
8
9 Classes:
10 Record Holds Medline data.
11 Iterator Iterates over a file containing Medline records.
12 RecordParser Parses a Medline record into a Record object.
13
14 _Scanner Scans a Medline record.
15 _RecordConsumer Consumes Medline data to a Record object.
16
17 """
18
19 from Bio import File
20 from Bio.ParserSupport import *
21
23 """Holds information from a Medline record.
24
25 Members:
26 id Medline ID.
27 pubmed_id Pubmed ID.
28
29 mesh_headings List of MeSH headings.
30 mesh_tree_numbers List of MeSH Tree Numbers.
31 mesh_subheadings List of MeSH subheadings.
32
33 abstract The abstract.
34 comments List of references to comments.
35 abstract_author The author of the abstract.
36 english_abstract "A" if a foreign article has an English abstract.
37
38 source Bibliographic information.
39 publication_types List of types of publication.
40 number_of_references Number of bibliographic references, for REVIEW pubs.
41
42 authors List of authors.
43 no_author "A" for anonymous.
44 address Address of the first author.
45
46 journal_title_code Three-character code assigned to the journal.
47 title_abbreviation Abbreviation of journal title.
48 issn International Standard Serial Number.
49 journal_subsets List of strings that describe journal groupings.
50 country Country of publication
51 languages List of languages of the article.
52
53 title Article title.
54 transliterated_title Title in the original language.
55 call_number The call number of the journal issue.
56 issue_part_supplement Issue, part, or supplement of journal published.
57 volume_issue Volume number of journal.
58 publication_date Date published (string).
59 year Year published (string).
60 pagination Inclusive pages of an indexed item.
61
62 special_list Coding for the database of the citation.
63
64 substance_name Preferred name for a chemical or drug.
65 gene_symbols List of abbreviated gene names.
66 secondary_source_ids List of source databanks and accessions.
67 identifications List of research grant or contract numbers.
68 registry_numbers List of CAS or EC numbers.
69
70 personal_name_as_subjects List of individuals who are subjects.
71
72 record_originators List of people who worked on record.
73 entry_date Date record made machine readable (YYMMDD).
74 entry_month YYMM entered into Medline.
75 class_update_date Date touched by Class Maintenance action (string).
76 last_revision_date Date for minor revision.
77 major_revision_date Date for major revision.
78
79 undefined List of lines that don't match the standard.
80
81 """
137
139 """Returns one record at a time from a file of Medline records.
140
141 Methods:
142 next Return the next record from the stream; raises StopIteration when there are no more records.
143
144 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Set up iteration over a stream of Medline records.

    handle is the file-like object the records are read from.  parser is
    an optional object used to convert each record into another form; when
    it is None the raw text of each record is handed back unchanged.

    """
    # Both values are only stored here; no reading happens until iteration.
    self._handle, self._parser = handle, parser
155
def next(self):
    """next(self) -> object

    Return the next Medline record from the file.

    Lines are read from the handle up to and including the first blank
    line, which terminates a record.  If a parser was supplied, the
    collected text is passed to its parse_str method and the result is
    returned; otherwise the raw text is returned.

    Raises StopIteration when the handle has no more lines.

    """
    lines = []
    # __init__ stores the stream as self._handle (not self.handle).
    for line in self._handle:
        lines.append(line)
        if line.strip() == '':
            break
    else:
        # The handle ran out without a terminating blank line.  Stop only
        # if nothing was read; otherwise a final record that lacks a
        # trailing blank line would be silently discarded.
        if not lines:
            raise StopIteration

    data = ''.join(lines)

    if self._parser is not None:
        return self._parser.parse_str(data)
    return data
179
181 """Parses Medline data into a Record object.
182
183 """
187
def parse(self, handle):
    """Scan one record from handle and return the consumer's data.

    The scanner is fed the handle together with this parser's consumer;
    whatever the consumer holds in its data attribute afterwards is the
    result.

    """
    feed = self._scanner.feed
    feed(handle, self._consumer)
    result = self._consumer.data
    return result
191
193 """Scans a Medline record.
194
195 """
196
# Maps the qualifier found at the start of a record line to the name of
# the consumer attribute that is looked up (via getattr) to handle that
# line.  Qualifiers missing from this table are passed to
# consumer.undefined by the scanning loop.
_categories = {
    "AA": "abstract_author",
    "AB": "abstract",
    "AD": "address",
    "AU": "author",
    "CA": "call_number",
    "CM": "comments",
    "CU": "class_update_date",
    "CY": "country",
    "DA": "entry_date",
    "DP": "publication_date",
    "EA": "english_abstract",
    "EM": "entry_month",
    "GS": "gene_symbol",
    "ID": "identification",
    "IP": "issue_part_supplement",
    "IS": "issn",
    "JC": "journal_title_code",
    "LA": "language",
    "LI": "special_list",
    "LR": "last_revision_date",
    "MH": "mesh_heading",
    "MN": "mesh_tree_number",
    "MR": "major_revision_date",
    "NI": "no_author",
    "NM": "substance_name",
    "PG": "pagination",
    "PS": "personal_name_as_subject",
    "PT": "publication_type",
    "RF": "number_of_references",
    "RN": "cas_registry_number",
    "RO": "record_originator",
    "SB": "journal_subset",
    "SH": "subheadings",
    "SI": "secondary_source_id",
    "SO": "source",
    "TA": "title_abbreviation",
    "TI": "title",
    "TT": "transliterated_title",
    "UI": "unique_identifier",
    "VI": "volume_issue",
    "YR": "year",

    # The only qualifier in the table that is four characters long.
    "PMID": "pubmed_id",
}
243
244 - def feed(self, handle, consumer):
261
263 consumer.start_record()
264
265 prev_qualifier = None
266 while 1:
267 line = uhandle.readline()
268 if is_blank_line(line):
269 break
270
271
272
273
274
275
276
277
278 qualifier = line[:4].rstrip()
279
280
281
282
283 if line[0] == '\t' or qualifier == '' or \
284 line[:13] == ' purification':
285 if prev_qualifier is None:
286 raise ValueError, "Continuation on first line\n%s" % line
287 qualifier = prev_qualifier
288 else:
289
290 if len(line) < 5 or line[4] != '-':
291 raise ValueError, \
292 "I don't understand the format of line %s" % line
293 prev_qualifier = qualifier
294
295 try:
296 fn = getattr(consumer, self._categories[qualifier])
297 except KeyError:
298
299 consumer.undefined(line)
300 else:
301 fn(line)
302
303 consumer.end_record()
304
306 """Consumer that converts a Medline record to a Record object.
307
308 Members:
309 data Record with Medline data.
310
311 """
314
317
320
323
326
329
332
336
339
344
348
def entry_date(self, line):
    """Store the cleaned contents of line as the record's entry date.

    Asserts that no entry date has been stored on the record yet.

    """
    record = self.data
    assert not record.entry_date, "entry date already defined"
    record.entry_date = self._clean(line)
352
357
362
def entry_month(self, line):
    """Store the cleaned contents of line as the record's entry month.

    Asserts that no entry month has been stored on the record yet.

    """
    record = self.data
    assert not record.entry_month, "entry month already defined"
    record.entry_month = self._clean(line)
367
370
373
378
379 - def issn(self, line):
382
387
390
394
399
401
402
403
404
405 if line[:2] == 'MH':
406 self.data.mesh_headings.append(self._clean(line))
407 else:
408 prev_mh = self.data.mesh_headings.pop()
409 continued_mh = self._clean(line)
410 self.data.mesh_headings.append("%s %s" % (prev_mh, continued_mh))
411
414
419
423
427
429 assert not self.data.pagination, "pagination already defined"
430 self.data.pagination = self._clean(line)
431
434
437
442
445
448
451
454
457
460
464
467
471
475
479
480 - def year(self, line):
483
487
497
def _clean(self, line, rstrip=1):
    """Return the data portion of a record line.

    The text kept is chosen as follows:
      - if the line contains a tab, everything after the first tab;
      - otherwise, if the line begins with ' purification', everything
        after its first character;
      - otherwise, everything from the seventh character onwards.

    When rstrip is true (the default) trailing whitespace is removed from
    the result.

    """
    tab_at = line.find('\t')
    if tab_at >= 0:
        start = tab_at + 1
    elif line.startswith(' purification'):
        start = 1
    else:
        start = 6
    text = line[start:]
    if not rstrip:
        return text
    return text.rstrip()
509
# Names of fields, listed together under "needs stripping"; the code that
# consumes this list is not visible in this part of the file.
_needs_stripping = [
    'abstract',
    'source',
    'address',
    'title_abbreviation',
    'title',
    'transliterated_title',
]
519