1 """nlmmedline_xml_format.py
2
3 A Martel format to parse the NLM's XML format for Medline.
4
5 http://www.nlm.nih.gov/databases/dtd/nlmmedline_031101.dtd
6 http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_031101.dtd
7 http://www.nlm.nih.gov/databases/dtd/nlmcommon_031101.dtd
8
9 Formats:
10 citation_format Format for one MedlineCitation.
11 format Format for a whole file.
12
13 """
14
15 import warnings
# Emitted once at import time: the module is deprecated.
warnings.warn(
    "Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be "
    "able to parse recent Medline XML files. If you want to continue to use "
    "this module, please get in contact with the Biopython developers at "
    "biopython-dev@biopython.org to avoid permanent removal of this module "
    "from Biopython",
    DeprecationWarning)
17
18
19 import sys
20
21 from Martel import *
22 from Martel import RecordReader
23
# Handle on this very module object.  The element helpers below call
# setattr(self, name, expr) so that every XML element ends up as a
# module-level expression named after its tag.
self = sys.modules[__name__]

# Optional Spaces() expression; interleaved between the PubMed-specific
# elements (PubMedPubDate, History, ArticleIdList, PubmedData) further down.
OSpaces = Opt(Spaces())
27
41
44
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Create a Martel Expression that recognizes an XML element of the form

        <element>data</element>

    and store it in this module's namespace under the element's own name.
    The whole element must be on a single line.

    element -- tag name; also used as the name of the Group that captures
               the data and as the name of the module attribute.
    attrs   -- attribute names, passed through unchanged to _start_elem().

    Nothing is returned; the expression is published with setattr().

    """
    start = _start_elem(element, *attrs)
    end = _end_elem(element)

    # "[^<]+" demands at least one character, so the data between the tags
    # must be non-empty and may not contain "<".
    data = Group(element, Re(r"[^<]+"))

    expr = start + data + end + AnyEol()
    setattr(self, element, expr)
66
67
68
69
70
# NOTE(review): the ``def`` line was absent from the text under review.  The
# name and parameters are inferred from the body (element, expr, attrs) and
# from call sites such as group_elem("AuthorList", Rep1(Author), "CompleteYN")
# -- confirm against the original file.
def group_elem(element, expr, *attrs):
    """group_elem(element, expr, *attrs)

    Wrap expr between a start-tag line and an end-tag line for element and
    store the resulting Group in this module's namespace under the name
    element.

    The start and end expressions are looked up as the module attributes
    "<element>_start" and "<element>_end".  If one of them does not exist
    yet it is built from _start_elem()/_end_elem() followed by AnyEol() and
    stored under that name; if it already exists it is reused as is (so
    attrs has no effect on a pre-defined start expression).

    element -- tag name; also the Group name and the module attribute name.
    expr    -- expression for the content between the two tag lines.
    attrs   -- attribute names, passed through to _start_elem() when the
               start expression has to be created.

    Nothing is returned.

    """
    start_name = "%s_start" % element
    end_name = "%s_end" % element

    start_expr = getattr(self, start_name, None)
    if start_expr is None:
        start_expr = _start_elem(element, *attrs) + AnyEol()
        setattr(self, start_name, start_expr)

    end_expr = getattr(self, end_name, None)
    if end_expr is None:
        end_expr = _end_elem(element) + AnyEol()
        setattr(self, end_name, end_expr)

    setattr(self, element, Group(element, start_expr + expr + end_expr))
85
86
87
88
89
90
91
92
93
# Name-related tags.  simple_elem() publishes one module-level expression per
# tag, which is why LastName, ForeName, ... can be used as plain names below.
elements = [
    "FirstName", "ForeName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName"
    ]
# A plain loop: the calls are made only for their side effect, so there is
# no point in building and discarding a list of None values.
for e in elements:
    simple_elem(e)

# LastName, then optionally either ForeName or FirstName (+ optional
# MiddleName), then optional Initials and optional Suffix.
personal_name = (
    LastName
    + Opt(Alt(ForeName, FirstName + Opt(MiddleName)))
    + Opt(Initials)
    + Opt(Suffix)
)
# An author is either a personal name or a CollectiveName.
author_name = Alt(personal_name, CollectiveName)
105
# Closed vocabularies, each expressed as an alternation of literal strings.
imprint_type = Alt(*(Str(word) for word in ("Current", "Original")))
indexing_status = Alt(*(Str(word) for word in (
    "Ceased-publication",
    "Continued-by-another-indexed-title",
    "Currently-indexed",
    "Currently-indexed-Title-changed",
    "Date-range-of-indexed-citations-unspecified",
    "Deselected",
)))
113
114
115
116
# Date-related tags, each published as a module-level expression.
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second"
    ]
# A plain loop: simple_elem() is called only for its side effect.
for e in elements:
    simple_elem(e)

# Year, Month and Day are required; the time part nests so that Minute can
# only follow Hour and Second can only follow Minute.
normal_date = Year + Month + Day + Opt(Hour + Opt(Minute + Opt(Second)))

# Either a Year optionally refined by Month (+ optional Day) or by Season,
# or a single MedlineDate element.
pub_date = Alt(Year + Opt(Alt(Month + Opt(Day), Season)), MedlineDate)
126
127
# Abstract: one AbstractText followed by an optional CopyrightInformation.
# The two simple elements must be registered first so their names exist
# when the group expression is built.
simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))
131
132
133
134
# Identifiers.
simple_elem("NlmUniqueID")
simple_elem("PMID")

# MeSH terms; these four tags are declared with a MajorTopicYN attribute.
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("QualifierName", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
simple_elem("DescriptorName", "MajorTopicYN")

# A heading is one descriptor (DescriptorName or Descriptor) followed by
# repeated qualifiers (all QualifierName or all SubHeading).
group_elem(
    "MeshHeading",
    Alt(DescriptorName, Descriptor)
    + Alt(Rep(QualifierName), Rep(SubHeading))
)
group_elem("MeshHeadingList", Rep1(MeshHeading))

# Pagination: either StartPage with optional EndPage and MedlinePgn, or a
# MedlinePgn on its own.
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem(
    "Pagination",
    Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn)
)
150
# Authors.
simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")

# Language and publication types.
simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))

# Titles and other bibliographic leaf elements.
simple_elem("Title")
simple_elem("Volume")
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")

# Book: PubDate, Publisher and Title are required, the rest is optional.
group_elem("PubDate", pub_date)
group_elem(
    "Book",
    PubDate + Publisher + Title
    + Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume)
)

# MedlineJournalInfo: only MedlineTA is required.
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem(
    "MedlineJournalInfo",
    Opt(Country) + MedlineTA + Opt(MedlineCode) + Opt(NlmUniqueID)
)

# Journal: only the JournalIssue (which itself requires a PubDate) is
# required.
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate, "PrintYN")
simple_elem("ISSN")
group_elem(
    "Journal",
    Opt(ISSN)
    + JournalIssue
    + Opt(Coden)
    + Opt(Title)
    + Opt(ISOAbbreviation)
)
184
# Grants: every field of a Grant is optional; the list is declared with a
# CompleteYN attribute.
simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", Opt(GrantID) + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")
# Data banks: a DataBankName followed by an optional list of accession
# numbers; the list of data banks is declared with a CompleteYN attribute.
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")
195
# Article: sub-elements must appear in exactly this order.  Required are the
# Journal-or-Book, ArticleTitle, Pagination, at least one Language and the
# PublicationTypeList; everything wrapped in Opt() may be absent.
group_elem(
    "Article",
    Alt(Journal, Book)
    + ArticleTitle
    + Pagination
    + Opt(Abstract)
    + Opt(Affiliation)
    + Opt(AuthorList)
    + Rep1(Language)
    + Opt(DataBankList)
    + Opt(GrantList)
    + PublicationTypeList
    + Opt(VernacularTitle)
    + Opt(DateOfElectronicPublication)
)
group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))
211
212
213
214
215
216
217
218
219
220
221
222
simple_elem("MedlineID")

# Shared content model for all the cross-reference elements below:
# a RefSource, an optional PMID or MedlineID, and an optional Note.
simple_elem("Note")
simple_elem("RefSource")
Ref_template = RefSource + Opt(Alt(PMID, MedlineID)) + Opt(Note)

# Every cross-reference element wraps the same Ref_template.
group_elem("OriginalReportIn", Ref_template)
group_elem("SummaryForPatientsIn", Ref_template)
group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("ErratumFor", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)

# The kinds must appear in this fixed order; each kind may repeat.
group_elem(
    "CommentsCorrections",
    Rep(CommentOn) + Rep(CommentIn)
    + Rep(ErratumIn) + Rep(ErratumFor)
    + Rep(RepublishedFrom) + Rep(RepublishedIn)
    + Rep(RetractionOf) + Rep(RetractionIn)
    + Rep(UpdateIn) + Rep(UpdateOf)
    + Rep(SummaryForPatientsIn) + Rep(OriginalReportIn)
)
# Remaining building blocks of a MedlineCitation.  Each call registers a
# module-level expression, so every name must be registered before the
# first expression that mentions it.
simple_elem("NumberOfReferences")
# Personal-name subjects reuse the personal_name expression.
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))
simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))
# A Chemical is a RegistryNumber followed by a NameOfSubstance.
simple_elem("NameOfSubstance")
simple_elem("RegistryNumber")
group_elem("Chemical", RegistryNumber + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))
simple_elem("CitationSubset")
simple_elem("GeneralNote", "Owner")
# An Investigator is a personal name with an optional Affiliation.
group_elem("Investigator", personal_name + Opt(Affiliation))
group_elem("InvestigatorList", Rep1(Investigator))
simple_elem("OtherID", "Source")
simple_elem("SpaceFlightMission")
simple_elem("Keyword", "MajorTopicYN")
group_elem("KeywordList", Rep1(Keyword), "Owner")
# OtherAbstract has the same content as Abstract but is declared with a
# Type attribute.
group_elem("OtherAbstract",
           AbstractText + Opt(CopyrightInformation),
           "Type")
# The three citation dates all share the normal_date layout.
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
# One complete citation record, declared with Owner and Status attributes.
# Sub-elements must appear in exactly this order; only DateCreated, Article
# and MedlineJournalInfo are required.
group_elem(
    "MedlineCitation",
    Opt(MedlineID)
    + Opt(PMID)
    + DateCreated
    + Opt(DateCompleted)
    + Opt(DateRevised)
    + Article
    + MedlineJournalInfo
    + Opt(ChemicalList)
    + Rep(CitationSubset)
    + Opt(CommentsCorrections)
    + Opt(GeneSymbolList)
    + Opt(MeshHeadingList)
    + Opt(NumberOfReferences)
    + Opt(PersonalNameSubjectList)
    + Rep(OtherID)
    + Rep(OtherAbstract)
    + Rep(KeywordList)
    + Rep(SpaceFlightMission)
    + Opt(InvestigatorList)
    + Rep(GeneralNote),
    "Owner", "Status"
)
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# Pre-define the DeleteCitation start/end lines so that they may be preceded
# by any number of spaces or tabs.  group_elem() looks these names up with
# getattr() and reuses them instead of building its defaults, so they must
# be assigned before the group_elem("DeleteCitation", ...) call below.
space = Any(" \t")
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# Same override mechanism for the closing tag of the whole set: here the
# end-of-line after "</MedlineCitationSet>" is optional.
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())

# DeleteCitation holds either MedlineIDs only or PMIDs only (at least one).
group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))
# This call also creates MedlineCitationSet_start as a module attribute,
# which header_format relies on later in the file.
group_elem("MedlineCitationSet", Rep(MedlineCitation) + Opt(DeleteCitation))
325
326
327
328
329
330
331
332
333
334
# PubMed-specific elements.  Unlike the Medline elements above, these allow
# optional whitespace (OSpaces) before each child and before the end tag.
group_elem(
    "PubMedPubDate",
    OSpaces + Year
    + OSpaces + Month
    + OSpaces + Day
    + Opt(OSpaces + Hour)
    + Opt(OSpaces + Minute)
    + OSpaces,
    "PubStatus"
)
group_elem("History", Rep(OSpaces + PubMedPubDate) + OSpaces)
simple_elem("PublicationStatus")
simple_elem("ArticleId", "IdType")
group_elem("ArticleIdList", Rep(OSpaces + ArticleId) + OSpaces)

# PubmedData: History, PublicationStatus and ArticleIdList, in that order.
group_elem(
    "PubmedData",
    OSpaces + History
    + OSpaces + PublicationStatus
    + OSpaces + ArticleIdList
)
group_elem("PubmedArticle", MedlineCitation + PubmedData)
# Articles in a set may be separated by any number of end-of-lines.
group_elem("PubmedArticleSet", Rep(PubmedArticle + Rep(AnyEol())))
354
# Literal XML declaration and DOCTYPE lines that pubmed_query_format requires
# in front of the PubmedArticleSet.
xml_version = Str('<?xml version="1.0"?>') + AnyEol()
doctype = (
    Str(
        '<!DOCTYPE PubmedArticleSet PUBLIC '
        '"-//NLM//DTD PubMedArticle, 1st November 2003//EN" '
        '"http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_031101.dtd">'
    )
    + AnyEol()
)

pubmed_query_format = xml_version + doctype + PubmedArticleSet
359
360
361
362
363
364
365
366
367
368
369
# Generic DOCTYPE declaration: the literal "<!DOCTYPE", one or more
# characters other than ">", the closing ">", then an end-of-line.
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

# Format for one record: exactly one MedlineCitation element.
citation_format = MedlineCitation
373
374
375
376
377
378
# Header: the DOCTYPE declaration followed by the opening tag line of the
# citation set.  MedlineCitationSet_start is not assigned by name anywhere;
# it is created as a module attribute by group_elem("MedlineCitationSet", ...).
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)
# Footer: an optional DeleteCitation block, then the closing tag of the set.
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end
# Format for a whole file: header, repeated citation records, footer.  Each
# part is given as (expression, reader, reader arguments).
# NOTE(review): presumably CountLines with (4,) reads the header as a fixed
# 4-line chunk, EndsWith splits records after each line ending in
# "</MedlineCitation>", and Everything hands the remainder to the footer --
# confirm against Martel's RecordReader.
format = HeaderFooter(
    "MedlineFile", {},
    header_format, RecordReader.CountLines, (4,),
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    footer_format, RecordReader.Everything, (),
    )
395