1 """
2 A Martel format to parse the NLM's XML format for Medline.
3
4 http://www.nlm.nih.gov/databases/dtd/nlmmedline_001211.dtd
5 http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_001211.dtd
6 http://www.nlm.nih.gov/databases/dtd/nlmcommon_001211.dtd
7
8 Formats:
9 citation_format Format for one MedlineCitation.
10 format Format for a whole file.
11
12 """
13
import warnings

# Emitted once, when the module is imported.  stacklevel=2 attributes the
# warning to the code that imported this module instead of to this file, so
# the report points at the caller that needs to change.
warnings.warn(
    "Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be "
    "able to parse recent Medline XML files. If you want to continue to "
    "use this module, please get in contact with the Biopython developers "
    "at biopython-dev@biopython.org to avoid permanent removal of this "
    "module from Biopython",
    DeprecationWarning,
    stacklevel=2
)
16
17
18 import sys
19
20 from Martel import *
21 from Martel import RecordReader
22
# Handle on this module object itself.  The element helpers below call
# getattr/setattr on it to publish generated expressions as module-level
# names.
self = sys.modules[__name__]
24
38
41
def simple_elem(element, *attrs):
    """Define a module-level expression matching <element>data</element>.

    NOTE(review): the ``def`` line was absent from the listing this block
    was recovered from.  The signature follows the one recorded in the
    original docstring, ``simple_elem(element, *attrs)``; confirm against
    the upstream file.

    The expression built here is: the start tag, a group named after the
    element that captures one or more non-'<' characters, the end tag and
    an end-of-line.  The whole element must be on a single line.

    element -- tag name; also used as the group name and as the name the
               expression is stored under in this module's namespace.
    attrs   -- forwarded unchanged to _start_elem (presumably the
               attribute names permitted on the start tag; verify there).

    Returns None; the result is published with setattr on the module.
    """
    start = _start_elem(element, *attrs)
    end = _end_elem(element)

    # Element text: at least one character, none of them '<'.
    content = Group(element, Re(r"[^<]+"))

    setattr(self, element, start + content + end + AnyEol())
63
64
65
66
67
def group_elem(element, expr, *attrs):
    """Define a module-level group for an element wrapping other elements.

    NOTE(review): the ``def`` line was absent from the listing this block
    was recovered from.  The signature is reconstructed from the names the
    body uses and from the call sites in this file, e.g.
    ``group_elem("AuthorList", Rep1(Author), "CompleteYN")``; confirm
    against the upstream file.

    The start and end tag expressions are looked up in this module under
    "<element>_start" and "<element>_end".  If either is missing, a default
    (the tag followed by an end-of-line) is created and stored under that
    name, so a caller may pre-bind either name to override the default.

    element -- tag name; also the group name and the module attribute the
               finished expression is stored under.
    expr    -- expression for the content between the two tags.
    attrs   -- forwarded unchanged to _start_elem when the default start
               tag has to be created.

    Returns None; results are published with setattr on the module.
    """
    start_name = "%s_start" % element
    end_name = "%s_end" % element

    # Reuse a pre-bound start tag if there is one, else create and publish
    # the default.
    start_expr = getattr(self, start_name, None)
    if start_expr is None:
        start_expr = _start_elem(element, *attrs) + AnyEol()
        setattr(self, start_name, start_expr)

    # Same for the end tag.
    end_expr = getattr(self, end_name, None)
    if end_expr is None:
        end_expr = _end_elem(element) + AnyEol()
        setattr(self, end_name, end_expr)

    setattr(self, element, Group(element, start_expr + expr + end_expr))
82
83
84
85
86
87
88
89
90
# Leaf elements used in names; each simple_elem call publishes a
# module-level expression named after the tag.
elements = [
    "FirstName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName",
]
for e in elements:
    simple_elem(e)

# LastName is mandatory; the remaining parts are optional, and MiddleName
# can only appear after FirstName.
personal_name = (
    LastName
    + Opt(FirstName + Opt(MiddleName))
    + Opt(Initials)
    + Opt(Suffix)
)

# An author is either a personal name or a collective name.
author_name = Alt(personal_name, CollectiveName)
102
103
104
105
# Leaf elements used in dates and times.
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second",
]
for e in elements:
    simple_elem(e)

# Full calendar date, optionally followed by Hour, then Minute, then Second
# (each only allowed if the previous one is present).
normal_date = Year + Month + Day + Opt(Hour + Opt(Minute + Opt(Second)))

# Publication date: a Year optionally refined by Month [Day] or by Season,
# or else a single MedlineDate element.
pub_date = Alt(
    Year + Opt(Alt(Month + Opt(Day), Season)),
    MedlineDate
)
115
116
117
118
119
# Identifier elements.
simple_elem("NlmUniqueID")
simple_elem("PMID")

# MeSH headings: one Descriptor followed by any number of SubHeadings; the
# list holds one or more headings.
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
group_elem("MeshHeading", Descriptor + Rep(SubHeading))
group_elem("MeshHeadingList", Rep1(MeshHeading))

# Pagination: StartPage [EndPage] [MedlinePgn], or a MedlinePgn on its own.
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem(
    "Pagination",
    Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn)
)

# Abstract: the text plus an optional copyright notice.
simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))
134
# Authors: a name with an optional affiliation; the list needs at least one
# author and its start tag is built with the "CompleteYN" attribute.
simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")

simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))

# Title-like and publisher leaf elements.
simple_elem("Title")
simple_elem("Volume")
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")

group_elem("PubDate", pub_date)
group_elem(
    "Book",
    PubDate + Publisher + Title
    + Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume)
)

# Journal-level bookkeeping kept alongside the article.
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem(
    "MedlineJournalInfo",
    Country + MedlineTA + MedlineCode + Opt(NlmUniqueID)
)

simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate)

# Journal: only JournalIssue is required.
simple_elem("ISSN")
group_elem(
    "Journal",
    Opt(ISSN)
    + JournalIssue
    + Opt(Coden)
    + Opt(Title)
    + Opt(ISOAbbreviation)
)
168
# Grants: an ID with optional acronym and agency; the list needs at least
# one grant and its start tag is built with the "CompleteYN" attribute.
simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", GrantID + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")

# Data banks: a name with an optional list of one or more accession numbers.
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")
179
# Article: the required parts are the Journal-or-Book, ArticleTitle,
# Pagination, at least one Language and the PublicationTypeList; the rest
# is optional.  The order below is the order the grammar accepts.
group_elem(
    "Article",
    Alt(Journal, Book)
    + ArticleTitle
    + Pagination
    + Opt(Abstract)
    + Opt(Affiliation)
    + Opt(AuthorList)
    + Rep1(Language)
    + Opt(DataBankList)
    + Opt(GrantList)
    + PublicationTypeList
    + Opt(VernacularTitle)
    + Opt(DateOfElectronicPublication)
)

group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))
195
196
197
198
199
200
201
202
203
204
205
206
simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")

# Content shared by every comment/correction reference element: a required
# RefSource, then an optional MedlineID and an optional Note.
Ref_template = RefSource + Opt(MedlineID) + Opt(Note)
212
213
214
215
216
# Each reference kind wraps the same Ref_template content.
group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)

# Any number of each kind, in this fixed order.
group_elem(
    "CommentsCorrections",
    Rep(CommentOn) + Rep(CommentIn)
    + Rep(ErratumIn)
    + Rep(RepublishedFrom) + Rep(RepublishedIn)
    + Rep(RetractionOf) + Rep(RetractionIn)
    + Rep(UpdateIn) + Rep(UpdateOf)
)
simple_elem("NumberOfReferences")

# People that are the subject of the citation; reuses personal_name.
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))

simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))

# Chemicals: registry number first, then the substance name.
simple_elem("NameOfSubstance")
simple_elem("CASRegistryNumber")
group_elem("Chemical", CASRegistryNumber + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))

simple_elem("CitationSubset")
simple_elem("SpaceFlightMission")
simple_elem("SponsoringAgency")
simple_elem("ProcurementSource")
simple_elem("Keyword")
simple_elem("AbstractAuthor")
group_elem("OtherAbstract", Abstract + AbstractAuthor)

# Any number of each item, in this fixed order.
group_elem(
    "AdditionalInformation",
    Rep(OtherAbstract)
    + Rep(Keyword)
    + Rep(ProcurementSource)
    + Rep(SponsoringAgency)
    + Rep(SpaceFlightMission)
)

# The three record dates all share the normal_date layout.
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
# One complete citation record.  MedlineID, DateCreated, Article and
# MedlineJournalInfo are required; everything else is optional or
# repeatable.  The start tag is built with the "CitationOwner" attribute.
group_elem(
    "MedlineCitation",
    MedlineID
    + Opt(PMID)
    + DateCreated
    + Opt(DateCompleted)
    + Opt(DateRevised)
    + Article
    + MedlineJournalInfo
    + Opt(AdditionalInformation)
    + Opt(ChemicalList)
    + Rep(CitationSubset)
    + Opt(CommentsCorrections)
    + Opt(GeneSymbolList)
    + Opt(MeshHeadingList)
    + Opt(NumberOfReferences)
    + Opt(PersonalNameSubjectList),
    "CitationOwner"
)
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# Space or tab, used for optional leading whitespace below.
space = Any(" \t")

# Pre-bind the DeleteCitation tags.  group_elem reuses an existing
# "<name>_start" / "<name>_end" instead of creating its default, so the
# DeleteCitation group picks up these variants that allow leading
# whitespace before the tag.
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# Likewise pre-bind the closing tag of the whole set; here the trailing
# end-of-line is optional.
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())

# A delete block lists one or more MedlineIDs, or one or more PMIDs.
group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))

# The set: any number of citations, then an optional delete block.
group_elem(
    "MedlineCitationSet",
    Rep(MedlineCitation) + Opt(DeleteCitation)
)
302
303
304
305
306
307
308
309
310
311
312
313
# The <!DOCTYPE ...> declaration: everything up to the first '>' followed by
# an end-of-line.
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

# Format for one MedlineCitation.
citation_format = MedlineCitation

# Header: DOCTYPE plus the opening tag of the set.  MedlineCitationSet_start
# is the name group_elem publishes when it builds "MedlineCitationSet".
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)

# Footer: optional delete block, then the closing tag of the set.
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end

# Format for a whole file: header, citation records, footer, each paired
# with the RecordReader class and the constructor arguments used for it.
format = HeaderFooter(
    None,
    {},
    header_format,
    RecordReader.Until,
    ("<MedlineCitation>",),
    citation_format,
    RecordReader.EndsWith,
    ("</MedlineCitation>",),
    footer_format,
    RecordReader.Everything,
    (),
)
331