Package Bio :: Package Medline :: Module nlmmedline_010319_format
[hide private]
[frames] | no frames]

Source Code for Module Bio.Medline.nlmmedline_010319_format

  1  """ 
  2  A Martel format to parse the NLM's XML format for Medline. 
  3   
  4  http://www.nlm.nih.gov/databases/dtd/nlmmedline_010319.dtd 
  5  http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_010319.dtd 
  6  http://www.nlm.nih.gov/databases/dtd/nlmcommon_010319.dtd 
  7   
  8  Formats: 
  9  citation_format    Format for one MedlineCitation. 
 10  format             Format for a whole file. 
 11   
 12  """ 
 13   
 14  import warnings 
# Issued as soon as the module is imported: tells callers that this
# parser is deprecated (reason and contact address are in the message).
warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
 16   
 17  import sys 
 18   
 19  from Martel import * 
 20  from Martel import RecordReader 
 21   
# Handle on this very module object.  simple_elem() and group_elem() use
# it with setattr()/getattr() to bind the expressions they build as
# module-level names, so later statements can refer to them directly.
self = sys.modules[__name__]
 23   
def _start_elem(element, *attrs):
    """Build the Martel expression for the opening tag of an element.

    element -- the tag name.
    attrs   -- names of attributes that may appear in the opening tag.

    Without attrs the result is the literal "<element>".  With attrs the
    result is "<element", a repetition (Rep) of a single space followed
    by one of the listed attributes written as name="value", and then
    ">".  Each value is one or more characters other than '<', '&' and
    '"', and is captured in a Group named after its attribute.
    """
    if not attrs:
        return Str("<%s>" % element)

    attr_alternatives = [
        Str(name) + Str("=") +
        Str('"') + Group(name, Re(r'[^<&"]+')) + Str('"')
        for name in attrs
    ]
    repeated_attrs = Rep(Str(" ") + Alt(*attr_alternatives))
    return Str("<") + Str(element) + repeated_attrs + Str(">")
37
def _end_elem(element):
    """Return a Martel Str expression for the literal "</element>"."""
    closing_tag = "</%s>" % element
    return Str(closing_tag)
40
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Define a Martel Expression that recognizes an XML element of the
    form:
        <element>data</element>
    where the opening tag may also carry any of the attributes named
    in attrs.

    The data (one or more characters other than '<') is captured in a
    Group named after the element, and the closing tag must be followed
    by an end of line.  Nothing is returned; the Expression is stored
    in this module's namespace under the same name as the element.

    """
    opening = _start_elem(element, *attrs)
    closing = _end_elem(element)
    content = Group(element, Re(r"[^<]+"))
    setattr(self, element, opening + content + closing + AnyEol())
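

# Group expressions.  A group consists of the start and end elements
# with an expression in-between.  The Expression for the group is
# bound in this module under the element's own name ("NAME"); its
# start and end tag expressions are bound as "NAME_start" and "NAME_end".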
def group_elem(element, expr, *attrs):
    """Define a Martel Group for an element that wraps other elements.

    element -- the tag name; also the name of the resulting Group.
    expr    -- the Expression expected between the start and end tags.
    attrs   -- names of attributes that may appear in the start tag.

    The start and end tag expressions live in this module as
    "<element>_start" and "<element>_end".  If either name is already
    bound to something other than None, that expression is reused;
    otherwise a default (the tag followed by an end of line) is created
    and stored.  Nothing is returned; the finished Group is stored in
    this module under the element's name.
    """
    def lookup_or_create(name, build_tag):
        # Reuse an expression already bound to `name` in this module,
        # or build the default one and remember it.
        tag_expr = getattr(self, name, None)
        if tag_expr is None:
            tag_expr = build_tag() + AnyEol()
            setattr(self, name, tag_expr)
        return tag_expr

    opening = lookup_or_create("%s_start" % element,
                               lambda: _start_elem(element, *attrs))
    closing = lookup_or_create("%s_end" % element,
                               lambda: _end_elem(element))
    setattr(self, element, Group(element, opening + expr + closing))


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmcommon_010319.dtd          #
######################################################################
#
# Every simple_elem()/group_elem() call below binds a new expression in
# this module under the element's name.  The statements are therefore
# order-sensitive: an element must be defined before the first
# expression that refers to it.

########################################
# Personal and Author names
elements = [
    "FirstName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName"
    ]
# simple_elem() is called only for its side effect, so use a plain loop
# instead of a list comprehension that builds a throwaway list.
for e in elements:
    simple_elem(e)
personal_name = LastName + \
                Opt(FirstName + Opt(MiddleName)) + \
                Opt(Initials) + \
                Opt(Suffix)
author_name = Alt(personal_name, CollectiveName)


########################################
# Dates
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second"
    ]
for e in elements:
    simple_elem(e)
# A full date, optionally refined by hour, then minute, then second.
normal_date = Year + Month + Day + \
              Opt(Hour + Opt(Minute + Opt(Second)))
# Either a Year optionally followed by (Month and optional Day) or by a
# Season, or else a single MedlineDate element.
pub_date = Alt((Year + Opt(Alt((Month + Opt(Day)), Season))), MedlineDate)


simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))

########################################
# NCBIArticle

simple_elem("NlmUniqueID")
simple_elem("PMID")
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
group_elem("MeshHeading", Descriptor + Rep(SubHeading))
group_elem("MeshHeadingList", Rep1(MeshHeading))
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem("Pagination",
           Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn))

simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")
simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))
simple_elem("Title")       # These were moved up, so that the definitions
simple_elem("Volume")      # will be before Book.
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")
group_elem("PubDate", pub_date)
group_elem("Book", PubDate + Publisher + Title +
           Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume))
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem("MedlineJournalInfo",
           Opt(Country) + MedlineTA + Opt(MedlineCode) + Opt(NlmUniqueID))
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate)
simple_elem("ISSN")
group_elem("Journal",
           Opt(ISSN) + \
           JournalIssue + \
           Opt(Coden) + \
           Opt(Title) + \
           Opt(ISOAbbreviation)
           )

simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", Opt(GrantID) + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")

group_elem("Article",
           Alt(Journal, Book) + \
           ArticleTitle + \
           Pagination + \
           Opt(Abstract) + \
           Opt(Affiliation) + \
           Opt(AuthorList) + \
           Rep1(Language) + \
           Opt(DataBankList) + \
           Opt(GrantList) + \
           PublicationTypeList + \
           Opt(VernacularTitle) + \
           Opt(DateOfElectronicPublication)
           )
group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))






######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_010319.dtd #
######################################################################


simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")
# Shared body of every cross-reference element defined below.
Ref_template = RefSource + Opt(MedlineID) + Opt(Note)


########################################
# MedlineCitation

group_elem("OriginalReportIn", Ref_template)
group_elem("SummaryForPatientsIn", Ref_template)
group_elem("CommentOn", Ref_template)
group_elem("CommentIn", Ref_template)
group_elem("ErratumIn", Ref_template)
group_elem("RepublishedFrom", Ref_template)
group_elem("RepublishedIn", Ref_template)
group_elem("RetractionOf", Ref_template)
group_elem("RetractionIn", Ref_template)
group_elem("UpdateIn", Ref_template)
group_elem("UpdateOf", Ref_template)
group_elem("CommentsCorrections",
           Rep(CommentOn) + Rep(CommentIn) + \
           Rep(ErratumIn) + \
           Rep(RepublishedFrom) + Rep(RepublishedIn) + \
           Rep(RetractionOf) + Rep(RetractionIn) + \
           Rep(UpdateIn) + Rep(UpdateOf) + \
           Rep(SummaryForPatientsIn) + Rep(OriginalReportIn)
           )
simple_elem("NumberOfReferences")
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))
simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))
simple_elem("NameOfSubstance")
simple_elem("CASRegistryNumber")
group_elem("Chemical", CASRegistryNumber + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))
simple_elem("CitationSubset")
simple_elem("GeneralNote", "Owner")
group_elem("Investigator", personal_name + Opt(Affiliation))
group_elem("InvestigatorList", Rep1(Investigator))
simple_elem("OtherID", "Source")
simple_elem("SpaceFlightMission")
simple_elem("Keyword", "MajorTopicYN")
group_elem("KeywordList", Rep1(Keyword), "Owner")
group_elem("OtherAbstract",
           AbstractText + Opt(CopyrightInformation),
           "Type")
group_elem("DateRevised", normal_date)
group_elem("DateCompleted", normal_date)
group_elem("DateCreated", normal_date)
group_elem("MedlineCitation",
           MedlineID + \
           Opt(PMID) + \
           DateCreated + \
           Opt(DateCompleted) + \
           Opt(DateRevised) + \
           Article + \
           MedlineJournalInfo + \
           Opt(ChemicalList) + \
           Rep(CitationSubset) + \
           Opt(CommentsCorrections) + \
           Opt(GeneSymbolList) + \
           Opt(MeshHeadingList) + \
           Opt(NumberOfReferences) + \
           Opt(PersonalNameSubjectList) + \
           Rep(OtherID) + \
           Rep(OtherAbstract) + \
           Rep(KeywordList) + \
           Rep(SpaceFlightMission) + \
           Opt(InvestigatorList) + \
           Rep(GeneralNote),
           "Owner"
           )






######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedline_010319.dtd         #
######################################################################



# The DeleteCitation tags start with spaces, so I have to make a
# special case for it.  These names must be bound before
# group_elem("DeleteCitation", ...) runs: group_elem reuses an existing
# NAME_start / NAME_end instead of generating its default.
space = Any(" \t")
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# The file doesn't always end in a newline, so make MedlineCitationSet
# end in an optional Eol.  As above, this is bound ahead of the
# group_elem("MedlineCitationSet", ...) call so that it is picked up.
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())


group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))
group_elem("MedlineCitationSet", Rep(MedlineCitation) + Opt(DeleteCitation))





######################################################################
# Other stuff                                                        #
#                                                                    #
######################################################################


# Should match the proper dtd in here...
# Matches "<!DOCTYPE", one or more characters other than '>', the
# closing '>' and an end of line.  The declared DTD is not checked.
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

# Public name for the expression that parses a single MedlineCitation.
citation_format = MedlineCitation

# I'm going to use a RecordReader so that I can parse one record at a
# time, instead of sucking the whole XML file into memory.  Each
# citation is going to be a record.  Thus, the header is everything
# before the first citation and the footer is everything after the
# last citation.
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end
# Public expression for a whole file.  The name shadows the built-in
# format() inside this module, but it is the module's documented
# interface.
# NOTE(review): the first two HeaderFooter arguments (None, {}) look
# like the group name and its attributes; the rest are (expression,
# reader, reader args) triples for header, record and footer --
# confirm against the Martel documentation.
format = HeaderFooter(
    None, {},
    # Unfortunately, RecordReader.Until doesn't work because some
    # MedlineCitations have attributes, in the form
    # <MedlineCitation Owner="NLM">.  "<MedlineCitation" by itself
    # won't differentiate between the beginning of a
    # MedlineCitationSet or the beginning of a MedlineCitation.  Thus,
    # I'm just going to read the first 4 lines and hope that's the
    # whole header.
    #header_format, RecordReader.Until, ("<MedlineCitation>",),
    header_format, RecordReader.CountLines, (4,),
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    footer_format, RecordReader.Everything, (),
    )