Package Bio :: Package Medline :: Module nlmmedline_001211_format
[hide private]
[frames] | no frames]

Source Code for Module Bio.Medline.nlmmedline_001211_format

  1  """ 
  2  A Martel format to parse the NLM's XML format for Medline. 
  3   
  4  http://www.nlm.nih.gov/databases/dtd/nlmmedline_001211.dtd 
  5  http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_001211.dtd 
  6  http://www.nlm.nih.gov/databases/dtd/nlmcommon_001211.dtd 
  7   
  8  Formats: 
  9  citation_format    Format for one MedlineCitation. 
 10  format             Format for a whole file. 
 11   
 12  """ 
 13   
 14  import warnings 
 15  warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 16   
 17   
 18  import sys 
 19   
 20  from Martel import * 
 21  from Martel import RecordReader 
 22   
# Handle on this very module object.  simple_elem() and group_elem() use it
# with setattr()/getattr() to publish each generated Martel expression as a
# module-level name (e.g. simple_elem("PMID") creates the global ``PMID``).
self = sys.modules[__name__]
 24   
def _start_elem(element, *attrs):
    """Build the Martel expression for the opening tag of *element*.

    element -- tag name, without angle brackets.
    attrs   -- names of the XML attributes the tag may carry.

    With no attribute names the result matches the literal ``<element>``.
    Otherwise it matches ``<element`` followed by a repetition of
    `` name="value"`` items (a single leading space each, any of the
    declared names, in any order) and then ``>``.  Each value is captured
    in a group named after its attribute and must be a non-empty run of
    characters other than ``<``, ``&`` and ``"``.
    """
    # No attributes declared: the whole tag is one fixed string.
    if not attrs:
        return Str("<%s>" % element)

    # One alternative per declared attribute: name="value".
    attr_exprs = [
        Str(name) + Str("=") + Str('"') + Group(name, Re(r'[^<&"]+')) + Str('"')
        for name in attrs
    ]
    any_attr = Alt(*attr_exprs)
    return Str("<") + Str(element) + Rep(Str(" ") + any_attr) + Str(">")
38
def _end_elem(element):
    """Return the Martel expression matching the closing tag ``</element>``."""
    closing_tag = "</%s>" % element
    return Str(closing_tag)
41
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Define a Martel Expression that recognizes a leaf XML element
    written as:
        <element>data</element>

    followed by an end of line.  The text between the tags must be
    non-empty and free of "<"; it is captured in a group named after
    the element.  Any names given in attrs are accepted as attributes
    on the start tag.

    Nothing is returned: the Expression is stored in this module's
    namespace under the element's own name.

    """
    open_tag = _start_elem(element, *attrs)
    close_tag = _end_elem(element)

    # Character data of the element, exposed as a group called <element>.
    content = Group(element, Re(r"[^<]+"))

    setattr(self, element, open_tag + content + close_tag + AnyEol())
# Group expressions.  A group consists of the start and end elements
# with an expression in-between.  The Expression for the group will be
# called "NAME".
def group_elem(element, expr, *attrs):
    """Define a Martel Expression for a container element.

    element -- tag name of the container.
    expr    -- Martel expression for everything between the tags.
    attrs   -- attribute names accepted on the start tag.

    The start and end tags are each expected to be followed by an end
    of line.  Their expressions are looked up in this module as
    "<element>_start" and "<element>_end"; if either is missing (or
    None) a default is built and stored under that name, so a
    hand-written tag expression defined beforehand takes precedence.

    Nothing is returned: the complete expression, wrapped in a group
    named after the element, is stored in this module's namespace under
    the element's own name.
    """
    # Opening tag: reuse a module-level override if one exists.
    opening = getattr(self, "%s_start" % element, None)
    if opening is None:
        opening = _start_elem(element, *attrs) + AnyEol()
        setattr(self, "%s_start" % element, opening)

    # Closing tag: same lookup-or-create policy.
    closing = getattr(self, "%s_end" % element, None)
    if closing is None:
        closing = _end_elem(element) + AnyEol()
        setattr(self, "%s_end" % element, closing)

    setattr(self, element, Group(element, opening + expr + closing))
######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmcommon_001211.dtd          #
######################################################################
#
# NOTE: statement order matters throughout.  simple_elem()/group_elem()
# create module-level names as a side effect, and every expression below
# refers to names created by earlier calls.

########################################
# Personal and Author names
elements = [
    "FirstName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName",
    ]
# Plain loop: simple_elem() is called only for its side effect of defining
# a module-level expression, so there is no list worth building.
for e in elements:
    simple_elem(e)
personal_name = (LastName +
                 Opt(FirstName + Opt(MiddleName)) +
                 Opt(Initials) +
                 Opt(Suffix))
author_name = Alt(personal_name, CollectiveName)


########################################
# Dates
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second",
    ]
for e in elements:
    simple_elem(e)
# Full date with an optional, progressively finer time of day.
normal_date = (Year + Month + Day +
               Opt(Hour + Opt(Minute + Opt(Second))))
# Either a year optionally refined by month[/day] or by a season, or a
# MedlineDate element.
pub_date = Alt((Year + Opt(Alt((Month + Opt(Day)), Season))), MedlineDate)


########################################
# NCBIArticle

simple_elem("NlmUniqueID")
simple_elem("PMID")
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
group_elem("MeshHeading", Descriptor + Rep(SubHeading))
group_elem("MeshHeadingList", Rep1(MeshHeading))
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem("Pagination",
           Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn))
simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))

simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")
simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))
simple_elem("Title")        # These were moved up, so that the definitions
simple_elem("Volume")       # will be before Book.
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")
group_elem("PubDate", pub_date)
group_elem("Book",
           PubDate + Publisher + Title +
           Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume))
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem("MedlineJournalInfo",
           Country + MedlineTA + MedlineCode + Opt(NlmUniqueID))
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate)
simple_elem("ISSN")
group_elem("Journal",
           Opt(ISSN) +
           JournalIssue +
           Opt(Coden) +
           Opt(Title) +
           Opt(ISOAbbreviation)
           )

simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", GrantID + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")

group_elem("Article",
           Alt(Journal, Book) +
           ArticleTitle +
           Pagination +
           Opt(Abstract) +
           Opt(Affiliation) +
           Opt(AuthorList) +
           Rep1(Language) +
           Opt(DataBankList) +
           Opt(GrantList) +
           PublicationTypeList +
           Opt(VernacularTitle) +
           Opt(DateOfElectronicPublication)
           )
group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_001211.dtd #
######################################################################


simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")
# Shared body of every comment/correction reference element below.
Ref_template = RefSource + Opt(MedlineID) + Opt(Note)


########################################
# MedlineCitation

group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)
# The reference kinds are accepted only in this fixed order, each any
# number of times (including none).
group_elem("CommentsCorrections",
           Rep(CommentOn) + Rep(CommentIn) +
           Rep(ErratumIn) +
           Rep(RepublishedFrom) + Rep(RepublishedIn) +
           Rep(RetractionOf) + Rep(RetractionIn) +
           Rep(UpdateIn) + Rep(UpdateOf)
           )
simple_elem("NumberOfReferences")
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))
simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))
simple_elem("NameOfSubstance")
simple_elem("CASRegistryNumber")
group_elem("Chemical", CASRegistryNumber + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))
simple_elem("CitationSubset")
simple_elem("SpaceFlightMission")
simple_elem("SponsoringAgency")
simple_elem("ProcurementSource")
simple_elem("Keyword")
simple_elem("AbstractAuthor")
group_elem("OtherAbstract", Abstract + AbstractAuthor)
group_elem("AdditionalInformation",
           Rep(OtherAbstract) +
           Rep(Keyword) +
           Rep(ProcurementSource) +
           Rep(SponsoringAgency) +
           Rep(SpaceFlightMission))
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
# One complete citation record; "CitationOwner" is the attribute accepted
# on the <MedlineCitation> start tag.
group_elem("MedlineCitation",
           MedlineID +
           Opt(PMID) +
           DateCreated +
           Opt(DateCompleted) +
           Opt(DateRevised) +
           Article +
           MedlineJournalInfo +
           Opt(AdditionalInformation) +
           Opt(ChemicalList) +
           Rep(CitationSubset) +
           Opt(CommentsCorrections) +
           Opt(GeneSymbolList) +
           Opt(MeshHeadingList) +
           Opt(NumberOfReferences) +
           Opt(PersonalNameSubjectList),
           "CitationOwner"
           )


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedline_001211.dtd         #
######################################################################


# The DeleteCitation tags start with spaces, so I have to make a
# special case for it.  These names must be bound before the
# group_elem("DeleteCitation", ...) call, which reuses an existing
# NAME_start / NAME_end expression instead of building the default one.
space = Any(" \t")
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# The file doesn't always end in a newline, so make MedlineCitationSet
# end in an optional Eol.  (Picked up by group_elem in the same way.)
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())


group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))
group_elem("MedlineCitationSet", Rep(MedlineCitation) + Opt(DeleteCitation))


######################################################################
# Other stuff                                                        #
#                                                                    #
######################################################################


# Should match the proper dtd in here...
# For now any <!DOCTYPE ...> declaration followed by an end of line is
# accepted.
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

citation_format = MedlineCitation

# I'm going to use a RecordReader so that I can parse one record at a
# time, instead of sucking the whole XML file into memory.  Each
# citation is going to be a record.  Thus, the header is everything
# before the first citation and the footer is everything after the
# last citation.
# Header: the DOCTYPE declaration plus the <MedlineCitationSet> start tag,
# wrapped in a group named "header".
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)
# Footer: an optional DeleteCitation block, then the closing
# </MedlineCitationSet> tag.
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end
# Whole-file format, built from three (expression, reader, reader-args)
# triples: header, one record per citation, footer.
# NOTE(review): the leading ``None, {}`` are presumably HeaderFooter's
# group name and attributes -- confirm against the Martel API.
format = HeaderFooter(
    None, {},
    # Header reader: RecordReader.Until with "<MedlineCitation>".
    header_format, RecordReader.Until, ("<MedlineCitation>",),
    # Record reader: RecordReader.EndsWith with "</MedlineCitation>".
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    # Footer reader: RecordReader.Everything, no arguments.
    footer_format, RecordReader.Everything, (),
    )