Package Bio :: Package expressions :: Package swissprot :: Module sprot38
[hide private]
[frames] | no frames]

Source Code for Module Bio.expressions.swissprot.sprot38

  1  """Parser for the SWISS-PROT 38 format. 
  2   
  3  You probably want to use the variables 'record' (for a single record) 
  4  and 'format' (for a set of records). 
  5   
  6  """ 
  7   
  8  import warnings 
  9  warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 10   
 11   
 12  import Martel 
 13  from Martel import RecordReader, Time 
 14  from Bio import Std 
 15   
def Simple(tag, tag_data):
    """Build the expression for a one-line field introduced by a line code.

    tag -- the line code (e.g. "GN"); it is matched literally at the start
           of the line and is also used as the name of the returned group.
    tag_data -- the name handed to Martel.ToEol for the remainder of the
                line.

    Returns a Martel.Group named `tag`.
    """
    # NOTE(review): this listing appears to have had runs of whitespace
    # collapsed; the separator after the line code may originally have been
    # wider than one space -- confirm against the upstream file.
    line_code = Martel.Str(tag + " ")
    remainder = Martel.ToEol(tag_data)
    return Martel.Group(tag, line_code + remainder)
#--- ID

# NOTE(review): the string literals in this section are reproduced as they
# appear in this listing, where runs of whitespace look collapsed.  The
# separators after the line codes ("ID ", "AC ", "DT ") may originally have
# been wider -- confirm against the upstream file.

# The ID line: entry name, data class, molecule type and sequence length.
ID = Martel.Group(
    "ID",
    Martel.Str("ID ") +
    Std.dbid(Martel.Word("entry_name"), {"type": "primary",
                                         "dbname": "sp"}) +
    Martel.Spaces() +
    Martel.Word("data_class_table") +
    Martel.Str(";") + Martel.Spaces() +
    Martel.Word("molecule_type") +
    Martel.Str(";") + Martel.Spaces() +
    Martel.Digits("sequence_length") +
    Martel.Str(" AA.") +
    Martel.AnyEol()
)

#--- AC

# One AC line: a first accession number followed by any number of
# additional "; "-separated accession numbers.
AC = Martel.Group(
    "AC",
    Martel.Str("AC ") +
    Std.dbid(Martel.Word("ac_number"),
             {"type": "accession",
              "dbname": "sp"}) +
    Martel.Str(";") +
    Martel.Rep(Martel.Str(" ") +
               Std.dbid(Martel.Word("ac_number"),
                        {"type": "accession"}) +
               Martel.Str(";")) +
    Martel.AnyEol())

AC_block = Martel.Group("AC_block", Martel.Rep1(AC))


#--- DT

##DT_created = Martel.Group("DT_created", Martel.Re(
##    r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "\
##    r"(?P<release>\d\d), Created\)\R"
##    ))
# The pattern is a raw string so that "\(", "\d", "\)" and "\R" reach Martel
# unchanged without relying on Python passing unknown escapes through.
DT_created = Martel.Group(
    "DT_created",
    Martel.Str("DT ") +
    Time.make_expression("%(DD)-%(Jan)-%(YYYY)") +
    Martel.Re(r" \(Rel. (?P<release>\d\d), Created\)\R"))


DT_seq_update = Martel.Group("DT_seq_update", Martel.Re(
    r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
    r"(?P<release>\d\d), Last sequence update\)\R"
    ))

DT_ann_update = Martel.Group("DT_ann_update", Martel.Re(
    r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
    r"(?P<release>\d\d), Last annotation update\)\R"
    ))

#--- DE

# Only the last DE is supposed to have a ".", but I don't see why *I*
# need to enforce it.
# NOTE(review): the string literals in this section are reproduced as they
# appear in this listing, where runs of whitespace look collapsed.  The
# separators after the line codes ("DE ", "CC ", "RN ", ...) may originally
# have been wider -- confirm against the upstream file.
DE = Martel.Group(
    "DE",
    Martel.Str("DE ") +
    Std.description(Martel.UntilEol("description")) +
    Martel.AnyEol())

DE_block = Std.description_block(Martel.Group("DE_block", Martel.Rep1(DE)))


#--- GN

GN = Simple("GN", "gene_names")
GN_block = Martel.Group("GN_block", Martel.Rep1(GN))

#--- OS

OS = Simple("OS", "organism_species")
OS_block = Martel.Group("OS_block", Martel.Rep1(OS))


#--- OG

OG = Simple("OG", "organelle")
OG_block = Martel.Group("OG_block", Martel.Rep1(OG))


#--- OC

OC = Simple("OC", "organism_classification")
OC_block = Martel.Group("OC_block", Martel.Rep1(OC))

############ Reference section

#--- RN

# occurs once
# (raw string: "\[", "\d" and "\R" are regex escapes, not Python ones)
RN = Martel.Group("RN", Martel.Re(r"RN \[(?P<reference_number>\d+)]\R"))

#--- RP

# occurs once
RP = Simple("RP", "reference_position")


#--- RC

# 0 or more
RC = Simple("RC", "reference_comment")
RC_block = Martel.Group("RC_block", Martel.Rep1(RC))

#--- RX

# 0 or 1
RX = Martel.Group(
    "RX",
    Martel.Re(r"RX (?P<bibliographic_database_name>\w+); "
              r"(?P<bibliographic_identifier>\d+)\.\R"))

#--- RA

# 1 or more
RA = Simple("RA", "reference_author")
RA_block = Martel.Group("RA_block", Martel.Rep1(RA))


#--- RT

# 0 or more
RT = Simple("RT", "reference_title")
RT_block = Martel.Group("RT_block", Martel.Rep1(RT))


#--- RL

# 1 or more

RL = Simple("RL", "reference_location")
RL_block = Martel.Group("RL_block", Martel.Rep1(RL))

# One complete reference: RN and RP are required, RC/RX/RT are optional,
# and at least one RA line and one RL line must be present.
reference = Martel.Group(
    "reference",
    RN +
    RP +
    Martel.Opt(RC_block) +
    Martel.Opt(RX) +
    RA_block +
    Martel.Opt(RT_block) +
    RL_block
)


############

#--- CC

# The first line of a comment starts with "-!-"; both the first line and
# the lines that follow it are grouped under the name "CC".
CC_begin = Martel.Group(
    "CC",
    Martel.Re("CC -!- ") +
    Martel.ToEol("comment_text"))
CC = Martel.Group(
    "CC",
    Martel.Re("CC ") +
    Martel.ToEol("comment_text"))

single_comment = Martel.Group(
    "comment",
    CC_begin +
    Martel.Rep(CC)
)


# The copyright notice is bracketed by CC lines made of dashes; the lines
# in between are any CC lines that are not themselves a dashed line.
CC_copyright_begin = Martel.Group(
    "CC_copyright_begin",
    Martel.Re(r"CC -+\R"))
CC_copyright = Martel.Group(
    "CC_copyright",
    Martel.Re(r"CC (?!-+\R)") +
    Martel.ToEol("copyright"))
CC_copyright_end = Martel.Group(
    "CC_copyright_end",
    Martel.Re(r"CC -+\R"))

# From N33_HUMAN
bogus_DR_group = Martel.Group(
    "bogus_DR_block",
    Martel.Re(r"(?P<DR>DR (?P<database_identifier>MIM); "
              r"(?P<primary_identifier>601385); "
              r"(?P<secondary_identifier>-).\R)")
)


comment = Martel.Group(
    "comment_block",
    Martel.Rep(single_comment) +
    Martel.Opt(bogus_DR_group) +
    Martel.Opt(CC_copyright_begin +
               Martel.Rep(CC_copyright) +
               CC_copyright_end
               )
)

#--- DR

# This is needed for things like
# DR MGD; MGI:95401; EPB4.1.
# where I need to scan up to the last "."  That is, I want
# "EPB4.1" to be the secondary identifier, not "EPB4" nor "EPB4.1."

_to_secondary_end = Martel.Re(r"([^.\R]|(?!.\R)\.)+")

database_id = Std.dbxref_dbname(Martel.UntilSep("database_identifier", ";"),
                                {"style": "sp"})

primary_id = Std.dbxref_dbid(Martel.UntilSep("primary_identifier", ";"),
                             {"type": "primary"})

secondary_id = Std.dbxref_dbid(Martel.Group("secondary_identifier",
                                            _to_secondary_end),
                               {"type": "accession"})

# used in StdHandler for fast dbxref - don't rename!
real_DR_general = Std.dbxref(database_id + Martel.Str("; ") +
                             primary_id + Martel.Str("; ") +
                             secondary_id,
                             )
fast_DR_general = Std.fast_dbxref(real_DR_general,
                                  {"style": "sp-general"})

DR_general = Martel.FastFeature(fast_DR_general, "fast-sp-dbxref",
                                real_DR_general.group_names())


# used in StdHandler for fast dbxref - don't rename!
# PROSITE / PFAM cross-references: database name, primary id, an unnamed
# accession field and a status field that runs up to the final ".".
real_DR_prosite = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group("database_identifier", Martel.Str("PROSITE", "PFAM")),
        {"style": "sp"},
    )
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(Martel.UntilSep(sep=";"), {"type": "accession"})
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)

# used in StdHandler for fast dxbref - don't rename!
fast_DR_prosite = Std.fast_dbxref(real_DR_prosite, {"style": "sp-prosite"})

DR_prosite = Martel.FastFeature(
    fast_DR_prosite,
    "fast-sp-dbxref",
    real_DR_prosite.group_names(),
)

# EMBL cross-references: same layout as above, but the accession field is
# named "secondary_identifier".
real_DR_embl = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group("database_identifier", Martel.Str("EMBL")),
        {"style": "sp"},
    )
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(
        Martel.UntilSep("secondary_identifier", ";"),
        {"type": "accession"},
    )
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)

fast_DR_embl = Std.fast_dbxref(real_DR_embl, {"style": "sp-embl"})
DR_embl = Martel.FastFeature(
    fast_DR_embl,
    "fast-sp-dbxref",
    real_DR_embl.group_names(),
)

# A DR line tries the EMBL form first, then PROSITE/PFAM, then the general
# form, and must end with "." and an end-of-line.
# NOTE(review): whitespace in this listing looks collapsed; the separator in
# "DR " may originally have been wider -- confirm against the upstream file.
DR = Martel.Group(
    "DR",
    Martel.Str("DR ")
    + Martel.Group("database_reference", DR_embl | DR_prosite | DR_general)
    + Martel.Str(".")
    + Martel.AnyEol(),
)

DR_block = Martel.Group("DR_block", Martel.Rep1(DR))


#--- KW

KW = Simple("KW", "keyword")
KW_block = Martel.Group("KW_block", Martel.Rep1(KW))


#--- FT

# NOTE(review): the column alignment of the layout sketch below was lost in
# this listing (runs of spaces collapsed) -- restore from the upstream file.
# FT DOMAIN 77 88 ASP/GLU-RICH (ACIDIC).
# 123456789012345678901234567890123456789012345678901234567890123456789012345
# 1 2 3 4 5 6 7
# FT ........ ...... ...... .........................................
# NOTE(review): the column alignment of the FT layout sketch below, and the
# widths of the space-only string literals in this section, appear to have
# been collapsed in this listing -- confirm against the upstream file.
# FT 12345678 123456 123456 12345678901234567890123456789012345678901
# FT .{8} .{6} .{6} [^\R]*
# 1 1 1234567

# "FT " + ".{8}" + " " + ".{6}" + " " + ".{6}" + " " + "[^\R]*" + "\R"
# "FT .{8} .{6} .{6} [^\R]*\R"

##FT_range = Martel.Group("FT",
##    Martel.Re("FT (?P<ft_name>.{8}) " \
##              "(?P<ft_from>.{6}) (?P<ft_to>.{6})" \
##              "( (?P<ft_description>[^\R]*))?\R")
##    )
##FT_continuation = Martel.Group("FT_continuation",
##    Martel.Re("FT " \
##              "(?P<ft_description>[^\R]*)\R")
##    )
##FT = Martel.Group("feature", FT_range + Martel.Rep(FT_continuation))

# Fixed-width pieces of a feature line: an 8-character name and two
# 6-character location fields, followed by free text to the end of line.
FT_name = Std.feature_name(Martel.Re(r".{8}"))
FT_start = Std.feature_location_start(Martel.Re(r".{6}"))
FT_end = Std.feature_location_end(Martel.Re(r".{6}"))
FT_desc = Std.feature_description(Martel.UntilEol())

# First line of a feature: name, start, end and an optional description.
FT_range = (
    Martel.Str("FT ") +
    FT_name +
    Martel.Str(" ") +
    FT_start +
    Martel.Str(" ") +
    FT_end +
    Martel.Opt(Martel.Str(" ") +
               FT_desc) +
    Martel.AnyEol()
)

# Follow-on line of a feature: description text only.
FT_continuation = (
    Martel.Str("FT ") +
    FT_desc +
    Martel.AnyEol()
)

FT = Std.feature(FT_range + Martel.Rep(FT_continuation),
                 {"location-style": "sp"})


##feature_block = Martel.Group("feature_block", Martel.Rep1(FT))
feature_block = Std.feature_block(Martel.Rep1(FT),
                                  {"style": "swissprot"})


#--- SQ

# SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CRC32;
# (Those X's don't really indicate the size)

# Raw strings keep "\d", "\w" and "\R" as regex escapes instead of relying
# on Python passing unknown escape sequences through unchanged.
SQ = Martel.Group(
    "SQ",
    Martel.Re(r"SQ SEQUENCE +(?P<sequence_length>\d+) AA;"
              r" +(?P<molecular_weight>\d+) MW;"
              r" +(?P<crc?type=32>\w+) CRC32;\R")
)
##SQ_data = Martel.Group("SQ_data",
##    Martel.Re(" (?P<sequence>[^\R]*)\R"))
SQ_data = (
    Martel.Str(" ") +
    Std.sequence(Martel.UntilEol()) +
    Martel.AnyEol()
)


##sequence = Martel.Group("sequence_block", Martel.Group("SQ_data_block",
##    SQ + Martel.Rep(SQ_data)))
# The SQ header line followed by any number of sequence data lines.
sequence = Std.sequence_block(SQ + Martel.Rep(SQ_data),
                              {"alphabet": "iupac-ambiguous-protein"})

#--- //

end = Martel.Group("END", Martel.Str("//") + Martel.AnyEol())

####################### put it all together

# A single record: the sections in the order they must appear.  OX_block is
# an empty placeholder group (Martel.NullOp) in this format version.
record = Std.record(
    ID +
    AC_block +
    DT_created +
    DT_seq_update +
    DT_ann_update +
    Martel.Opt(DE_block) +
    Martel.Opt(GN_block) +
    Martel.Opt(OS_block) +
    Martel.Opt(OG_block) +
    Martel.Opt(OC_block) +
    Martel.Group("OX_block", Martel.NullOp()) +
    Martel.Group("reference_block", Martel.Rep(reference)) +
    comment +
    Martel.Opt(DR_block) +
    Martel.Opt(KW_block) +
    Martel.Opt(feature_block) +
    sequence +
    end,
    {"format": "swissprot/38"})


# One or more records as a single expression.
format_expression = Martel.Group("dataset", Martel.Rep1(record),
                                 {"format": "swissprot/38"})

# Record-at-a-time form of the same dataset: the input is split on lines
# equal to "//\n" and each piece is parsed with `record`.
format = Martel.ParseRecords("dataset", {"format": "swissprot/38"},
                             record, RecordReader.EndsWith, ("//\n",))

if __name__ == "__main__":
    import sys

    # Data file to parse; an optional first command-line argument overrides
    # the original hard-coded default.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = "/home/dalke/ftps/swissprot/sprot38.dat"

    exp = Martel.select_names(format, ("entry_name", "sequence"))
    parser = exp.make_parser()
    infile = open(filename)
    try:
        parser.parseFile(infile)
    finally:
        # Close the handle even if parsing raises.
        infile.close()