Package Bio :: Package Medline :: Module nlmmedline_011101_format
[hide private]
[frames | no frames]

Source Code for Module Bio.Medline.nlmmedline_011101_format

  1  """nlmmedline_011101_format.py 
  2   
  3  A Martel format to parse the NLM's XML format for Medline. 
  4   
  5  http://www.nlm.nih.gov/databases/dtd/nlmmedline_011101.dtd 
  6  http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_011101.dtd 
  7  http://www.nlm.nih.gov/databases/dtd/nlmcommon_011101.dtd 
  8   
  9  Formats: 
 10  citation_format    Format for one MedlineCitation. 
 11  format             Format for a whole file. 
 12   
 13  """ 
 14   
 15  import warnings 
 16  warnings.warn("Bio.Medline.NLMMedlineXML was deprecated, as it does not seem to be able to parse recent Medline XML files. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 17   
 18  import sys 
 19   
 20  from Martel import * 
 21  from Martel import RecordReader 
 22   
 23  self = sys.modules[__name__] 
 24   
def _start_elem(element, *attrs):
    """Return a Martel expression matching the opening tag of `element`.

    Without `attrs` the expression is the literal text ``<element>``.

    With `attrs` it is ``<element``, then a repetition (``Rep``) of a
    single space followed by any one of the listed ``name="value"``
    pairs, then ``>``.  Each value is captured in a Group named after
    its attribute and must be one or more characters other than ``<``,
    ``&`` and ``"``.
    """
    if not attrs:
        return Str("<%s>" % element)

    # One name="value" sub-expression per permitted attribute.
    attr_groups = [
        Str(name) + Str("=") +
        Str('"') + Group(name, Re(r'[^<&"]+')) + Str('"')
        for name in attrs
    ]
    tag_open = Str("<") + Str(element)
    attr_section = Rep(Str(" ") + Alt(*attr_groups))
    return tag_open + attr_section + Str(">")
38
def _end_elem(element):
    """Return a Martel expression matching the literal ``</element>``."""
    closing_tag = "</%s>" % element
    return Str(closing_tag)
41
def simple_elem(element, *attrs):
    """simple_elem(element, *attrs)

    Build a Martel Expression for a one-line XML element of the form

        <element>data</element>

    and store it in this module's namespace under the name `element`.

    The opening tag comes from _start_elem(element, *attrs), so any
    names given in `attrs` are accepted as attributes of the tag.  The
    data is captured in a Group named `element` and must be one or
    more characters other than "<".  The closing tag must be followed
    by an end of line.  Nothing is returned.

    """
    open_tag = _start_elem(element, *attrs)
    close_tag = _end_elem(element)
    content = Group(element, Re(r"[^<]+"))
    setattr(self, element, open_tag + content + close_tag + AnyEol())
63 64 65 # Group expressions. A group consists of the start and end elements 66 # with an expression in-between. The Expression for the group will be 67 # called "NAME".
def group_elem(element, expr, *attrs):
    """Register a Group expression named `element` in this module.

    The group is ``<element>_start + expr + <element>_end``.  If the
    module already has an attribute called "<element>_start" or
    "<element>_end", that expression is reused as-is (and, for the
    start tag, `attrs` is then not consulted).  Otherwise a default is
    built from _start_elem(element, *attrs) / _end_elem(element)
    followed by an end of line, and stored under that attribute name.
    The finished Group is stored as the module attribute `element`.
    Nothing is returned.
    """
    def _lookup_or_define(attr_name, build_tag):
        # Reuse a pre-defined tag expression when the module has one;
        # otherwise create the default and remember it on the module.
        tag_expr = getattr(self, attr_name, None)
        if tag_expr is None:
            tag_expr = build_tag() + AnyEol()
            setattr(self, attr_name, tag_expr)
        return tag_expr

    opening = _lookup_or_define("%s_start" % element,
                                lambda: _start_elem(element, *attrs))
    closing = _lookup_or_define("%s_end" % element,
                                lambda: _end_elem(element))

    setattr(self, element, Group(element, opening + expr + closing))
######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmcommon_011101.dtd          #
######################################################################
#
# simple_elem() and group_elem() store each expression in this module
# under the element's name, so every element has to be defined before
# the first expression that refers to it.  The statement order below
# is therefore significant.

########################################
# Personal and Author names
elements = [
    "FirstName", "ForeName", "MiddleName", "LastName",
    "Initials", "Suffix",
    "CollectiveName"
    ]
# Plain loop: simple_elem() is called only for its side effect.
for e in elements:
    simple_elem(e)
personal_name = (LastName +
                 Opt(Alt(ForeName, FirstName + Opt(MiddleName))) +
                 Opt(Initials) +
                 Opt(Suffix))
author_name = Alt(personal_name, CollectiveName)


########################################
# Dates
elements = [
    "Year", "Month", "Day",
    "Season", "MedlineDate",
    "Hour", "Minute", "Second"
    ]
for e in elements:
    simple_elem(e)
normal_date = (Year + Month + Day +
               Opt(Hour + Opt(Minute + Opt(Second))))
pub_date = Alt((Year + Opt(Alt((Month + Opt(Day)), Season))), MedlineDate)


simple_elem("CopyrightInformation")
simple_elem("AbstractText")
group_elem("Abstract", AbstractText + Opt(CopyrightInformation))

########################################
# NCBIArticle

simple_elem("NlmUniqueID")
simple_elem("PMID")
simple_elem("SubHeading", "MajorTopicYN")
simple_elem("QualifierName", "MajorTopicYN")
simple_elem("Descriptor", "MajorTopicYN")
simple_elem("DescriptorName", "MajorTopicYN")
group_elem("MeshHeading",
           Alt(DescriptorName, Descriptor) +
           Alt(Rep(QualifierName), Rep(SubHeading)))
group_elem("MeshHeadingList", Rep1(MeshHeading))
simple_elem("MedlinePgn")
simple_elem("EndPage")
simple_elem("StartPage")
group_elem("Pagination",
           Alt(StartPage + Opt(EndPage) + Opt(MedlinePgn), MedlinePgn))

simple_elem("Affiliation")
group_elem("Author", author_name + Opt(Affiliation))
group_elem("AuthorList", Rep1(Author), "CompleteYN")
simple_elem("Language")
simple_elem("PublicationType")
group_elem("PublicationTypeList", Rep1(PublicationType))
simple_elem("Title")       # These were moved up, so that the definitions
simple_elem("Volume")      # will be before Book.
simple_elem("VernacularTitle")
simple_elem("CollectionTitle")
simple_elem("ArticleTitle")
simple_elem("Publisher")
group_elem("PubDate", pub_date)
group_elem("Book", PubDate + Publisher + Title +
           Opt(AuthorList) + Opt(CollectionTitle) + Opt(Volume))
simple_elem("Country")
simple_elem("MedlineTA")
simple_elem("MedlineCode")
group_elem("MedlineJournalInfo",
           Opt(Country) + MedlineTA + Opt(MedlineCode) + Opt(NlmUniqueID))
simple_elem("DateOfElectronicPublication")
simple_elem("ISOAbbreviation")
simple_elem("Coden")
simple_elem("Issue")
group_elem("JournalIssue", Opt(Volume) + Opt(Issue) + PubDate)
simple_elem("ISSN")
group_elem("Journal",
           Opt(ISSN) +
           JournalIssue +
           Opt(Coden) +
           Opt(Title) +
           Opt(ISOAbbreviation)
           )

simple_elem("GrantID")
simple_elem("Acronym")
simple_elem("Agency")
group_elem("Grant", Opt(GrantID) + Opt(Acronym) + Opt(Agency))
group_elem("GrantList", Rep1(Grant), "CompleteYN")
simple_elem("AccessionNumber")
group_elem("AccessionNumberList", Rep1(AccessionNumber))
simple_elem("DataBankName")
group_elem("DataBank", DataBankName + Opt(AccessionNumberList))
group_elem("DataBankList", Rep1(DataBank), "CompleteYN")

group_elem("Article",
           Alt(Journal, Book) +
           ArticleTitle +
           Pagination +
           Opt(Abstract) +
           Opt(Affiliation) +
           Opt(AuthorList) +
           Rep1(Language) +
           Opt(DataBankList) +
           Opt(GrantList) +
           PublicationTypeList +
           Opt(VernacularTitle) +
           Opt(DateOfElectronicPublication)
           )
group_elem("NCBIArticle", PMID + Article + Opt(MedlineJournalInfo))

######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedlinecitation_011101.dtd #
######################################################################


simple_elem("MedlineID")

simple_elem("Note")
simple_elem("RefSource")
# Body shared by every cross-reference element defined below.
Ref_template = RefSource + Opt(Alt(PMID, MedlineID)) + Opt(Note)


########################################
# MedlineCitation

# Each of these elements wraps the same Ref_template body.
for _ref_elem in (
        "OriginalReportIn",
        "SummaryForPatientsIn",
        "CommentOn",
        "CommentIn",
        "ErratumIn",
        "RepublishedFrom",
        "RepublishedIn",
        "RetractionOf",
        "RetractionIn",
        "UpdateIn",
        "UpdateOf",
        ):
    group_elem(_ref_elem, Ref_template)
del _ref_elem   # keep the loop variable out of the module namespace
group_elem("CommentsCorrections",
           Rep(CommentOn) + Rep(CommentIn) +
           Rep(ErratumIn) +
           Rep(RepublishedFrom) + Rep(RepublishedIn) +
           Rep(RetractionOf) + Rep(RetractionIn) +
           Rep(UpdateIn) + Rep(UpdateOf) +
           Rep(SummaryForPatientsIn) + Rep(OriginalReportIn)
           )
simple_elem("NumberOfReferences")
group_elem("PersonalNameSubject", personal_name)
group_elem("PersonalNameSubjectList", Rep1(PersonalNameSubject))
simple_elem("GeneSymbol")
group_elem("GeneSymbolList", Rep1(GeneSymbol))
simple_elem("NameOfSubstance")
simple_elem("CASRegistryNumber")
simple_elem("RegistryNumber")
group_elem("Chemical",
           Alt(CASRegistryNumber, RegistryNumber) + NameOfSubstance)
group_elem("ChemicalList", Rep1(Chemical))
simple_elem("CitationSubset")
simple_elem("GeneralNote", "Owner")
group_elem("Investigator", personal_name + Opt(Affiliation))
group_elem("InvestigatorList", Rep1(Investigator))
simple_elem("OtherID", "Source")
simple_elem("SpaceFlightMission")
simple_elem("Keyword", "MajorTopicYN")
group_elem("KeywordList", Rep1(Keyword), "Owner")
group_elem("OtherAbstract",
           AbstractText + Opt(CopyrightInformation),
           "Type")
# The three citation dates all use the normal_date body.
for _date_elem in ("DateRevised", "DateCompleted", "DateCreated"):
    group_elem(_date_elem, normal_date)
del _date_elem   # keep the loop variable out of the module namespace
group_elem("MedlineCitation",
           Opt(MedlineID) +
           Opt(PMID) +
           DateCreated +
           Opt(DateCompleted) +
           Opt(DateRevised) +
           Article +
           MedlineJournalInfo +
           Opt(ChemicalList) +
           Rep(CitationSubset) +
           Opt(CommentsCorrections) +
           Opt(GeneSymbolList) +
           Opt(MeshHeadingList) +
           Opt(NumberOfReferences) +
           Opt(PersonalNameSubjectList) +
           Rep(OtherID) +
           Rep(OtherAbstract) +
           Rep(KeywordList) +
           Rep(SpaceFlightMission) +
           Opt(InvestigatorList) +
           Rep(GeneralNote),
           "Owner", "Status"
           )


######################################################################
# Implement Martel expressions that recognize:                       #
# http://www.nlm.nih.gov/databases/dtd/nlmmedline_011101.dtd         #
######################################################################


# The DeleteCitation tags start with spaces, so they need a special
# case.  These names are defined before group_elem("DeleteCitation", ...)
# runs, so group_elem picks them up instead of building its defaults.
space = Any(" \t")
DeleteCitation_start = Rep(space) + Str("<DeleteCitation>") + AnyEol()
DeleteCitation_end = Rep(space) + Str("</DeleteCitation>") + AnyEol()

# The file doesn't always end in a newline, so make MedlineCitationSet
# end in an optional Eol.
MedlineCitationSet_end = Str("</MedlineCitationSet>") + Opt(AnyEol())


group_elem("DeleteCitation", Alt(Rep1(MedlineID), Rep1(PMID)))
group_elem("MedlineCitationSet",
           Rep(MedlineCitation) + Opt(DeleteCitation))


######################################################################
# Other stuff                                                        #
#                                                                    #
######################################################################


# Should match the proper dtd in here...
DOCTYPE = Str("<!DOCTYPE") + Re(r"[^>]+") + Str(">") + AnyEol()

citation_format = MedlineCitation

# A RecordReader is used so that one record can be parsed at a time,
# instead of sucking the whole XML file into memory.  Each citation is
# a record.  Thus, the header is everything before the first citation
# and the footer is everything after the last citation.
header_format = Group("header", DOCTYPE + MedlineCitationSet_start)
footer_format = Opt(DeleteCitation) + MedlineCitationSet_end
format = HeaderFooter(
    None, {},
    # Unfortunately, RecordReader.Until doesn't work because some
    # MedlineCitation start tags carry attributes, in the form
    # <MedlineCitation Owner="NLM">.  "<MedlineCitation" by itself
    # won't differentiate between the beginning of a
    # MedlineCitationSet and the beginning of a MedlineCitation.  Thus,
    # just read the first 4 lines and hope that's the whole header.
    # The rejected alternative was:
    #header_format, RecordReader.Until, ("<MedlineCitation>",),
    header_format, RecordReader.CountLines, (4,),
    citation_format, RecordReader.EndsWith, ("</MedlineCitation>",),
    footer_format, RecordReader.Everything, (),
    )