1 """nlmmedline_xml_format.py
2
3 A Martel format to parse the NLM's XML format for Medline.
4
5 http://www.nlm.nih.gov/databases/dtd/nlmmedline_011101.dtd
6 http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_011101.dtd
7 http://www.nlm.nih.gov/databases/dtd/nlmcommon_011101.dtd
8
9 Formats:
10 citation_format Format for one MedlineCitation.
11 format Format for a whole file.
12
13 """
14
# Announce the deprecation as soon as the module is imported, before the
# Martel dependency is pulled in.
import warnings
warnings.warn(
    "Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be "
    "able to parse recent Medline XML files. If you want to continue to "
    "use this module, please get in contact with the Biopython developers "
    "at biopython-dev@biopython.org to avoid permanent removal of this "
    "module from Biopython",
    DeprecationWarning)

import sys

from Martel import *
from Martel import RecordReader

# Handle on this very module object.  The element helpers in this file
# publish the expressions they build with setattr(self, name, expr) and
# look up existing ones with getattr(self, name, None).
self = sys.modules[__name__]
24
38
41
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Create a Martel Expression in this module's namespace that will
    recognize an XML element in the form of:
    <element>data</element>

    The whole element must be on a single line.  The Expression will
    be created in the module's namespace with the same name as the
    element.

    Arguments:
    element -- tag name.  It is also used as the name of the Group that
               captures the element's text and as the module attribute
               under which the finished Expression is stored.
    attrs   -- forwarded unchanged to _start_elem together with the tag
               name.

    Nothing is returned; the result is published with setattr on the
    module object.

    """
    start, end = _start_elem(element, *attrs), _end_elem(element)

    # The captured text is one or more characters other than "<", so an
    # element with empty content does not match this expression.
    group_name = element
    group_expression = Re(r"[^<]+")
    expr = start + \
           Group(group_name, group_expression) + \
           end + \
           AnyEol()
    setattr(self, element, expr)
63
64
65
66
67
def group_elem(element, expr, *attrs):
    """group_elem(element, expr, *attrs)

    Create a Martel Expression in this module's namespace that will
    recognize an XML element whose start tag and end tag each sit on
    their own line, with expr matching everything in between.

    Three names are involved in the module's namespace:
    <element>_start  Expression for the start-tag line.
    <element>_end    Expression for the end-tag line.
    <element>        Group named element wrapping start + expr + end.

    If <element>_start or <element>_end already exists in the module
    (and is not None) it is reused as-is, which lets a caller install a
    custom start or end expression before calling group_elem.  Otherwise
    the default one is built from _start_elem / _end_elem followed by
    AnyEol() and stored under that name.

    Arguments:
    element -- tag name; also the Group name and the attribute name.
    expr    -- Martel Expression for the element's content.
    attrs   -- forwarded unchanged to _start_elem when the default start
               expression has to be built.

    Nothing is returned; the result is published with setattr on the
    module object.

    """
    start_name, end_name = "%s_start" % element, "%s_end" % element

    # Reuse a pre-installed start expression, or build the default one.
    start_expr = getattr(self, start_name, None)
    if start_expr is None:
        start_expr = _start_elem(element, *attrs) + AnyEol()
        setattr(self, start_name, start_expr)

    # Same for the end expression.
    end_expr = getattr(self, end_name, None)
    if end_expr is None:
        end_expr = _end_elem(element) + AnyEol()
        setattr(self, end_name, end_expr)

    group_expr = start_expr + expr + end_expr
    group_expr = Group(element, group_expr)
    setattr(self, element, group_expr)
82
83
84
85
86
87
88
89
90
# Single-line elements that make up a name.
elements = [
    "FirstName", "ForeName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName"
    ]
# simple_elem is called only for its side effect (it stores the
# expression on the module), so use a plain loop rather than building
# and discarding a list of None values.
for e in elements:
    simple_elem(e)

# A personal name: LastName, then optionally either ForeName or
# FirstName (with an optional MiddleName), then optional Initials and
# optional Suffix, in that order.
personal_name = (
    LastName +
    Opt(Alt(ForeName, FirstName + Opt(MiddleName))) +
    Opt(Initials) +
    Opt(Suffix)
)

# An author is named either personally or by a CollectiveName.
author_name = Alt(personal_name, CollectiveName)
102
103
104
105
# Single-line elements that make up dates and times.
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second"
    ]
# simple_elem is called only for its side effect (it stores the
# expression on the module), so use a plain loop rather than building
# and discarding a list of None values.
for e in elements:
    simple_elem(e)

# Full date: Year, Month and Day are all required; a time may follow,
# where Minute is only allowed after Hour and Second only after Minute.
normal_date = (
    Year + Month + Day +
    Opt(Hour + Opt(Minute + Opt(Second)))
)

# Publication date: either a Year optionally refined by a Month (with
# optional Day) or by a Season, or else a single MedlineDate element.
pub_date = Alt(
    Year + Opt(Alt(Month + Opt(Day), Season)),
    MedlineDate)
115
116
# <Abstract>: the abstract text, optionally followed by copyright
# information.
simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem(
    "Abstract",
    AbstractText + Opt(CopyrightInformation))
120
121
122
123
# Identifier elements referenced by later groups.
simple_elem("NlmUniqueID")
simple_elem("PMID")

# Parts of a MeSH heading.  "MajorTopicYN" is forwarded to _start_elem
# as the extra argument -- presumably the name of an XML attribute the
# start tag may carry; confirm against _start_elem.
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("QualifierName", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
simple_elem("DescriptorName", "MajorTopicYN")

# One heading: a descriptor under either tag name, followed by zero or
# more qualifiers that are all QualifierName or all SubHeading.
group_elem(
    "MeshHeading",
    (Alt(DescriptorName, Descriptor) +
     Alt(Rep(QualifierName), Rep(SubHeading))))
group_elem("MeshHeadingList", Rep1(MeshHeading))

# <Pagination>: either StartPage with optional EndPage and optional
# MedlinePgn, or a MedlinePgn on its own.
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem(
    "Pagination",
    Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn),
        MedlinePgn))
139
# <Author>: a personal or collective name plus an optional affiliation.
simple_elem("Affiliation")
group_elem(
    "Author",
    author_name + Opt(Affiliation))
# <AuthorList>: one or more authors; "CompleteYN" is forwarded to
# _start_elem as the extra argument.
group_elem(
    "AuthorList",
    Rep1(Author),
    "CompleteYN")

simple_elem("Language")

# <PublicationTypeList>: one or more PublicationType elements.
simple_elem("PublicationType")
group_elem(
    "PublicationTypeList",
    Rep1(PublicationType))
# Title-like single-line elements.
simple_elem("Title")
simple_elem("Volume")
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")

# <PubDate> wraps the publication-date alternatives in pub_date.
group_elem("PubDate", pub_date)

# <Book>: PubDate, Publisher and Title are required, in that order;
# AuthorList, CollectionTitle and Volume may follow.
group_elem(
    "Book",
    (PubDate + Publisher + Title +
     Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume)))
# <MedlineJournalInfo>: only MedlineTA is required; Country may precede
# it, MedlineCode and NlmUniqueID may follow it.
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem(
    "MedlineJournalInfo",
    (Opt(Country) +
     MedlineTA +
     Opt(MedlineCode) +
     Opt(NlmUniqueID)))
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")

# <JournalIssue>: optional Volume and Issue, then a required PubDate.
simple_elem("Issue")
group_elem(
    "JournalIssue",
    Opt(Volume) + Opt(Issue) + PubDate)

# <Journal>: only JournalIssue is required; ISSN may precede it, and
# Coden, Title and ISOAbbreviation may follow it, in that order.
simple_elem("ISSN")
group_elem(
    "Journal",
    (Opt(ISSN) +
     JournalIssue +
     Opt(Coden) +
     Opt(Title) +
     Opt(ISOAbbreviation)))
173
# <Grant>: every child (GrantID, Acronym, Agency) is optional.
simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem(
    "Grant",
    Opt(GrantID) + Opt(Acronym) + Opt(Agency))
# "CompleteYN" is forwarded to _start_elem as the extra argument.
group_elem(
    "GrantList",
    Rep1(Grant),
    "CompleteYN")

# <DataBank>: a DataBankName with an optional list of accession numbers.
simple_elem("AccessionNumber")
group_elem(
    "AccessionNumberList",
    Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem(
    "DataBank",
    DataBankName + Opt(AccessionNumberList))
group_elem(
    "DataBankList",
    Rep1(DataBank),
    "CompleteYN")
184
# <Article>: children must appear in exactly this order.  Required are
# a Journal or Book, ArticleTitle, Pagination, at least one Language and
# a PublicationTypeList; everything wrapped in Opt() may be absent.
group_elem(
    "Article",
    (Alt(Journal, Book) +
     ArticleTitle +
     Pagination +
     Opt(Abstract) +
     Opt(Affiliation) +
     Opt(AuthorList) +
     Rep1(Language) +
     Opt(DataBankList) +
     Opt(GrantList) +
     PublicationTypeList +
     Opt(VernacularTitle) +
     Opt(DateOfElectronicPublication)))

# <NCBIArticle>: a PMID, the Article, and optional journal info.
group_elem(
    "NCBIArticle",
    PMID + Article + Opt(MedlineJournalInfo))
200
201
202
203
204
205
206
207
208
209
210
211
simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")

# Shared content model for the reference-style groups: a required
# RefSource, then optionally one identifier (PMID or MedlineID), then an
# optional Note.
Ref_template = (
    RefSource +
    Opt(Alt(PMID, MedlineID)) +
    Opt(Note)
)
217
218
219
220
221
# Every one of these groups has the same content, Ref_template; only the
# tag name differs.
group_elem("OriginalReportIn", Ref_template)
group_elem("SummaryForPatientsIn", Ref_template)
group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)

# <CommentsCorrections>: zero or more of each reference kind, in this
# fixed order.
group_elem(
    "CommentsCorrections",
    (Rep(CommentOn) + Rep(CommentIn) +
     Rep(ErratumIn) +
     Rep(RepublishedFrom) + Rep(RepublishedIn) +
     Rep(RetractionOf) + Rep(RetractionIn) +
     Rep(UpdateIn) + Rep(UpdateOf) +
     Rep(SummaryForPatientsIn) + Rep(OriginalReportIn)))
simple_elem("NumberOfReferences")

# People who are the subject of the citation; same shape as
# personal_name.
group_elem("PersonalNameSubject", personal_name)
group_elem(
    "PersonalNameSubjectList",
    Rep1(PersonalNameSubject))

simple_elem("GeneSymbol")
group_elem(
    "GeneSymbolList",
    Rep1(GeneSymbol))

# <Chemical>: a registry number under either tag name, then the name of
# the substance.
simple_elem("NameOfSubstance")
simple_elem("CASRegistryNumber")
simple_elem("RegistryNumber")
group_elem(
    "Chemical",
    Alt(CASRegistryNumber, RegistryNumber) + NameOfSubstance)
group_elem(
    "ChemicalList",
    Rep1(Chemical))
# Extra string arguments ("Owner", "Source", "MajorTopicYN", "Type") are
# forwarded to _start_elem by simple_elem / group_elem.
simple_elem("CitationSubset")
simple_elem("GeneralNote", "Owner")

# <Investigator>: a personal name plus an optional affiliation.
group_elem(
    "Investigator",
    personal_name + Opt(Affiliation))
group_elem(
    "InvestigatorList",
    Rep1(Investigator))

simple_elem("OtherID", "Source")
simple_elem("SpaceFlightMission")

simple_elem("Keyword", "MajorTopicYN")
group_elem(
    "KeywordList",
    Rep1(Keyword),
    "Owner")

# <OtherAbstract> has the same content as <Abstract>.
group_elem(
    "OtherAbstract",
    AbstractText + Opt(CopyrightInformation),
    "Type")

# The three citation dates all use the full normal_date form.
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
# <MedlineCitation>: one complete citation record.  Children must appear
# in exactly this order; DateCreated, Article and MedlineJournalInfo are
# the only required ones.  "Owner" and "Status" are forwarded to
# _start_elem as the extra arguments.
group_elem(
    "MedlineCitation",
    (Opt(MedlineID) +
     Opt(PMID) +
     DateCreated +
     Opt(DateCompleted) +
     Opt(DateRevised) +
     Article +
     MedlineJournalInfo +
     Opt(ChemicalList) +
     Rep(CitationSubset) +
     Opt(CommentsCorrections) +
     Opt(GeneSymbolList) +
     Opt(MeshHeadingList) +
     Opt(NumberOfReferences) +
     Opt(PersonalNameSubjectList) +
     Rep(OtherID) +
     Rep(OtherAbstract) +
     Rep(KeywordList) +
     Rep(SpaceFlightMission) +
     Opt(InvestigatorList) +
     Rep(GeneralNote)),
    "Owner", "Status")
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# A single blank: space or tab.
space = Any(" \t")

# Install custom start/end expressions for <DeleteCitation> before
# group_elem runs: group_elem reuses an existing <name>_start/<name>_end
# module attribute instead of building the default.  These versions
# allow the tags to be preceded by any amount of blanks.
DeleteCitation_start = (
    Rep(space) + Str("<DeleteCitation>") + AnyEol())
DeleteCitation_end = (
    Rep(space) + Str("</DeleteCitation>") + AnyEol())

# Likewise pre-install the closing tag of the whole set, with the
# end-of-line after it made optional.
MedlineCitationSet_end = (
    Str("</MedlineCitationSet>") + Opt(AnyEol()))

# <DeleteCitation>: one or more MedlineIDs, or one or more PMIDs.
group_elem(
    "DeleteCitation",
    Alt(Rep1(MedlineID), Rep1(PMID)))

# <MedlineCitationSet>: any number of citations, then an optional
# DeleteCitation.  This call also creates MedlineCitationSet_start.
group_elem(
    "MedlineCitationSet",
    Rep(MedlineCitation) + Opt(DeleteCitation))
315
316
317
318
319
320
321
322
323
324
325
326
# The document type declaration: "<!DOCTYPE", then one or more
# characters other than ">", then ">" and an end-of-line.  The character
# class only excludes ">", so the pattern itself does not forbid line
# breaks inside the declaration.
DOCTYPE = (
    Str("<!DOCTYPE") +
    Re(r"[^>]+") +
    Str(">") +
    AnyEol()
)

# Format for one MedlineCitation.
citation_format = MedlineCitation
330
331
332
333
334
335
# Format for a whole file, split into header / records / footer.
#   header: the DOCTYPE declaration plus the opening set tag
#   footer: an optional DeleteCitation plus the closing set tag
header_format = Group(
    "header",
    DOCTYPE + MedlineCitationSet_start)
footer_format = (
    Opt(DeleteCitation) + MedlineCitationSet_end)

# Each section is given as (expression, RecordReader class, reader args).
# NOTE(review): presumably CountLines with (4,) takes the first four
# lines as the header, EndsWith cuts a record at "</MedlineCitation>",
# and Everything hands the remainder to the footer -- confirm against
# the Martel RecordReader documentation.
format = HeaderFooter(
    None,
    {},
    header_format, RecordReader.CountLines, (4,),
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    footer_format, RecordReader.Everything, (),
    )
352