Package Bio :: Package Medline :: Module nlmmedline_031101_format
[hide private]
[frames] | no frames]

Source Code for Module Bio.Medline.nlmmedline_031101_format

  1  """nlmmedline_xml_format.py 
  2   
  3  A Martel format to parse the NLM's XML format for Medline. 
  4   
  5  http://www.nlm.nih.gov/databases/dtd/nlmmedline_031101.dtd 
  6  http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_031101.dtd 
  7  http://www.nlm.nih.gov/databases/dtd/nlmcommon_031101.dtd 
  8   
  9  Formats: 
 10  citation_format    Format for one MedlineCitation. 
 11  format             Format for a whole file. 
 12   
 13  """ 
 14   
 15  import warnings 
 16  warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 17   
 18   
 19  import sys 
 20   
 21  from Martel import * 
 22  from Martel import RecordReader 
 23   
 24  self = sys.modules[__name__] 
 25   
 26  OSpaces = Opt(Spaces()) 
 27   
def _start_elem(element, *attrs):
    """Build the Martel expression for an opening XML tag.

    element -- name of the tag to match.
    attrs   -- optional names of attributes the tag may carry.

    With no attribute names the result matches the literal text
    "<element>".  Otherwise it matches "<element", then zero or more
    repetitions of a single space followed by name="value" for any of
    the listed names, then ">".  Each value (one or more characters
    other than <, & and ") is captured in a Group named after its
    attribute.

    """
    if not attrs:
        return Str("<%s>" % element)

    # One name="value" alternative per permitted attribute.
    alternatives = [
        Str(name) + Str("=") + Str('"') +
        Group(name, Re(r'[^<&"]+')) + Str('"')
        for name in attrs
    ]
    return Str("<") + Str(element) + \
        Rep(Str(" ") + Alt(*alternatives)) + \
        Str(">")
41
def _end_elem(element):
    """Return a Martel expression matching the closing tag "</element>"."""
    closing_tag = "</%s>" % element
    return Str(closing_tag)
44
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Register, in this module's namespace, a Martel Expression that
    recognizes an XML element written as:
        <element>data</element>

    element -- tag name; also the name of the Group that captures the
               data and the module attribute the Expression is stored
               under.
    attrs   -- optional attribute names the opening tag may carry
               (passed through to _start_elem).

    The data is one or more characters other than "<", and the closing
    tag must be followed by an end-of-line (AnyEol()).  The element is
    expected to sit on a single line.  Nothing is returned; the
    Expression is only stored on the module.

    """
    opening = _start_elem(element, *attrs)
    closing = _end_elem(element)
    data = Group(element, Re(r"[^<]+"))
    setattr(self, element, opening + data + closing + AnyEol())
# Group expressions.  A group consists of the start and end elements
# with an expression in-between.  The Expression for the group will be
# called "NAME".
def group_elem(element, expr, *attrs):
    """group_elem(element, expr, *attrs)

    Register, in this module's namespace, a Martel Group named
    `element` that matches a start tag line, then `expr`, then an end
    tag line.

    element -- tag name; also the Group name and the module attribute
               the result is stored under.
    expr    -- Martel Expression for the content between the tags.
    attrs   -- optional attribute names allowed on the start tag; only
               used when a default start expression has to be built.

    If the module already defines "<element>_start" or "<element>_end"
    (with a value other than None), that expression is reused, which
    lets callers special-case a tag before calling this function.
    Otherwise a default (the tag followed by AnyEol()) is built and
    stored under that name.  Nothing is returned.

    """
    opening_key = "%s_start" % element
    closing_key = "%s_end" % element

    # Prefer a hand-written start expression when one is present.
    opening = getattr(self, opening_key, None)
    if opening is None:
        opening = _start_elem(element, *attrs) + AnyEol()
        setattr(self, opening_key, opening)

    # Same for the end expression.
    closing = getattr(self, closing_key, None)
    if closing is None:
        closing = _end_elem(element) + AnyEol()
        setattr(self, closing_key, closing)

    setattr(self, element, Group(element, opening + expr + closing))
######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmcommon_031101.dtd          #
######################################################################

# NOTE: simple_elem() and group_elem() store each expression on this
# module under the element's own name, which is why names such as
# LastName or Abstract can be used below right after they are declared.

########################################
# Personal and Author names
elements = [
    "FirstName", "ForeName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName"
    ]
# Plain loop: only the registration side effect is wanted, not a list.
for e in elements:
    simple_elem(e)
personal_name = (
    LastName +
    Opt(Alt(ForeName, FirstName + Opt(MiddleName))) +
    Opt(Initials) +
    Opt(Suffix)
    )
author_name = Alt(personal_name, CollectiveName)

imprint_type = Alt(Str("Current"), Str("Original"))
indexing_status = Alt(
    Str("Ceased-publication"), Str("Continued-by-another-indexed-title"),
    Str("Currently-indexed"), Str("Currently-indexed-Title-changed"),
    Str("Date-range-of-indexed-citations-unspecified"),
    Str("Deselected")
    )


########################################
# Dates
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second"
    ]
# Plain loop: only the registration side effect is wanted, not a list.
for e in elements:
    simple_elem(e)
normal_date = (
    Year + Month + Day +
    Opt(Hour + Opt(Minute + Opt(Second)))
    )
pub_date = Alt((Year + Opt(Alt((Month + Opt(Day)), Season))), MedlineDate)


simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))

########################################
# NCBIArticle

simple_elem("NlmUniqueID")
simple_elem("PMID")
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("QualifierName", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
simple_elem("DescriptorName", "MajorTopicYN")
group_elem("MeshHeading",
           Alt(DescriptorName, Descriptor) +
           Alt(Rep(QualifierName), Rep(SubHeading)))
group_elem("MeshHeadingList", Rep1(MeshHeading))
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem("Pagination",
           Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn))

simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")
simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))
simple_elem("Title")    # These were moved up, so that the definitions
simple_elem("Volume")   # will be before Book.
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")
group_elem("PubDate", pub_date)
group_elem("Book",
           PubDate + Publisher + Title +
           Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume))
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem("MedlineJournalInfo",
           Opt(Country) + MedlineTA + Opt(MedlineCode) + Opt(NlmUniqueID))
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate, "PrintYN")
simple_elem("ISSN")
group_elem("Journal",
           Opt(ISSN) +
           JournalIssue +
           Opt(Coden) +
           Opt(Title) +
           Opt(ISOAbbreviation)
           )

simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", Opt(GrantID) + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")

group_elem("Article",
           Alt(Journal, Book) +
           ArticleTitle +
           Pagination +
           Opt(Abstract) +
           Opt(Affiliation) +
           Opt(AuthorList) +
           Rep1(Language) +
           Opt(DataBankList) +
           Opt(GrantList) +
           PublicationTypeList +
           Opt(VernacularTitle) +
           Opt(DateOfElectronicPublication)
           )
group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_031101.dtd #
######################################################################


simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")
# Shared content model for every element inside CommentsCorrections.
Ref_template = RefSource + Opt(Alt(PMID, MedlineID)) + Opt(Note)


########################################
# MedlineCitation

group_elem("OriginalReportIn", Ref_template)
group_elem("SummaryForPatientsIn", Ref_template)
group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("ErratumFor", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)
group_elem("CommentsCorrections",
           Rep(CommentOn) + Rep(CommentIn) +
           Rep(ErratumIn) + Rep(ErratumFor) +
           Rep(RepublishedFrom) + Rep(RepublishedIn) +
           Rep(RetractionOf) + Rep(RetractionIn) +
           Rep(UpdateIn) + Rep(UpdateOf) +
           Rep(SummaryForPatientsIn) + Rep(OriginalReportIn)
           )
simple_elem("NumberOfReferences")
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))
simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))
simple_elem("NameOfSubstance")
simple_elem("RegistryNumber")
group_elem("Chemical", RegistryNumber + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))
simple_elem("CitationSubset")
simple_elem("GeneralNote", "Owner")
group_elem("Investigator", personal_name + Opt(Affiliation))
group_elem("InvestigatorList", Rep1(Investigator))
simple_elem("OtherID", "Source")
simple_elem("SpaceFlightMission")
simple_elem("Keyword", "MajorTopicYN")
group_elem("KeywordList", Rep1(Keyword), "Owner")
group_elem(
    "OtherAbstract",
    AbstractText + Opt(CopyrightInformation),
    "Type",
)
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
# One complete citation record; the start tag may carry Owner and
# Status attributes.
group_elem(
    "MedlineCitation",
    Opt(MedlineID) +
    Opt(PMID) +
    DateCreated +
    Opt(DateCompleted) +
    Opt(DateRevised) +
    Article +
    MedlineJournalInfo +
    Opt(ChemicalList) +
    Rep(CitationSubset) +
    Opt(CommentsCorrections) +
    Opt(GeneSymbolList) +
    Opt(MeshHeadingList) +
    Opt(NumberOfReferences) +
    Opt(PersonalNameSubjectList) +
    Rep(OtherID) +
    Rep(OtherAbstract) +
    Rep(KeywordList) +
    Rep(SpaceFlightMission) +
    Opt(InvestigatorList) +
    Rep(GeneralNote),
    "Owner", "Status",
)


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedline_011101.dtd         #
######################################################################


# The DeleteCitation tags are preceded by spaces or tabs, so they need
# hand-written start/end expressions.  group_elem() picks up existing
# NAME_start / NAME_end attributes instead of building its defaults.
space = Any(" \t")
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# The file doesn't always end in a newline, so the end of the
# MedlineCitationSet is given an optional Eol.
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())


group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))
group_elem("MedlineCitationSet", Rep(MedlineCitation) + Opt(DeleteCitation))


######################################################################
# For Pubmed queries                                                 #
#                                                                    #
######################################################################

# In this section optional whitespace (OSpaces) is allowed before each
# child element and before the closing tag.
group_elem(
    "PubMedPubDate",
    OSpaces + Year +
    OSpaces + Month +
    OSpaces + Day +
    Opt(OSpaces + Hour) +
    Opt(OSpaces + Minute) +
    OSpaces,
    "PubStatus",
)
group_elem("History", Rep(OSpaces + PubMedPubDate) + OSpaces)
simple_elem("PublicationStatus")
simple_elem("ArticleId", "IdType")
group_elem("ArticleIdList", Rep(OSpaces + ArticleId) + OSpaces)

group_elem(
    "PubmedData",
    OSpaces + History +
    OSpaces + PublicationStatus +
    OSpaces + ArticleIdList,
)
group_elem("PubmedArticle", MedlineCitation + PubmedData)
group_elem("PubmedArticleSet", Rep(PubmedArticle + Rep(AnyEol())))

xml_version = Str('<?xml version="1.0"?>') + AnyEol()
doctype = Str('<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st November 2003//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_031101.dtd">') + AnyEol()

pubmed_query_format = xml_version + doctype + PubmedArticleSet


######################################################################
# Other stuff                                                        #
#                                                                    #
######################################################################


# Should match the proper dtd in here...
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

citation_format = MedlineCitation

# A RecordReader is used so that one record can be parsed at a time,
# instead of sucking the whole XML file into memory.  Each citation is
# a record.  Thus, the header is everything before the first citation
# and the footer is everything after the last citation.
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end
format = HeaderFooter(
    "MedlineFile", {},
    # Unfortunately, RecordReader.Until doesn't work because some
    # MedlineCitations have attributes in the form
    # <MedlineCitation Owner="NLM">.  "<MedlineCitation" by itself
    # won't differentiate between the beginning of a
    # MedlineCitationSet or the beginning of a MedlineCitation.  Thus,
    # the first 4 lines are read in the hope that they are the whole
    # header.
    #header_format, RecordReader.Until, ("<MedlineCitation>",),
    header_format, RecordReader.CountLines, (4,),
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    footer_format, RecordReader.Everything, (),
    )