Package Martel :: Package test :: Package testformats :: Module swissprot38
[hide private]
[frames] | [no frames]

Source Code for Module Martel.test.testformats.swissprot38

  1  """Parser for the SWISS-PROT 38 format. 
  2   
  3  You probably want to use the variables 'record' (for a single record) 
  4  and 'format' (for a set of records). 
  5   
  6  """ 
  7  import Martel 
  8  from Martel import RecordReader 
  9   
def Simple(tag, tag_data):
    """Build the expression for one plain "<tag> <free text>" line.

    Args:
        tag: Line code that starts the line (e.g. "DE"). It is used both as
            the literal prefix to match and as the name of the outer group.
        tag_data: Name of the inner group that captures the text following
            the prefix, up to (not including) the end of line.

    Returns:
        Martel.Group(tag, Str(tag + " ") + Group(tag_data, <text>) + AnyEol()).
    """
    # NOTE(review): the prefix is the tag plus the whitespace shown here;
    # SWISS-PROT line codes are normally followed by a fixed run of blanks,
    # so confirm this literal against the upstream file.
    prefix = Martel.Str(tag + " ")
    # Raw string: "\R" is not a Python escape, so the value is unchanged,
    # but the non-raw spelling triggers invalid-escape warnings on newer
    # Python versions.  "\R" is presumably Martel's end-of-line class.
    text = Martel.Group(tag_data, Martel.Re(r"[^\R]*"))
    return Martel.Group(tag, prefix + text + Martel.AnyEol())
#--- ID

# Entry header line: entry name, data class, molecule type, sequence length.
ID = Martel.Group(
    "ID",
    Martel.Re(
        r"ID (?P<entry_name>\w+) +(?P<data_class_table>\w+); +"
        r"(?P<molecule_type>\w+); +(?P<sequence_length>\d+) AA\.\R"
    )
)

#--- AC

# One accession line: a first "ac_number;" followed by any number of
# further " ac_number;" items (the group name is deliberately repeated).
AC = Martel.Group(
    "AC",
    Martel.Re(r"AC (?P<ac_number>\w+);( (?P<ac_number>\w+);)*\R")
)
AC_block = Martel.Group("AC_block", Martel.Rep1(AC))


#--- DT

def _dt_line(group_name, event):
    """Return the DT line expression whose parenthesised note ends in *event*.

    All three DT lines share the date and release fields and differ only in
    the trailing text ("Created", "Last sequence update", ...), so the
    pattern is assembled here from the common prefix plus that text.
    *event* is spliced into the regex as-is and must not contain regex
    metacharacters.
    """
    pattern = (
        r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
        r"(?P<release>\d\d), " + event + r"\)\R"
    )
    return Martel.Group(group_name, Martel.Re(pattern))


DT_created = _dt_line("DT_created", "Created")
DT_seq_update = _dt_line("DT_seq_update", "Last sequence update")
DT_ann_update = _dt_line("DT_ann_update", "Last annotation update")

#--- DE

# Only the last DE is supposed to have a ".", but I don't see why *I*
# need to enforce it.
# Each "<X>_block" below groups one or more consecutive lines of kind <X>.
DE = Simple("DE", "description")
DE_block = Martel.Group("DE_block", Martel.Rep1(DE))


#--- GN

GN = Simple("GN", "gene_names")
GN_block = Martel.Group("GN_block", Martel.Rep1(GN))

#--- OS

OS = Simple("OS", "organism_species")
OS_block = Martel.Group("OS_block", Martel.Rep1(OS))


#--- OG

OG = Simple("OG", "organelle")
OG_block = Martel.Group("OG_block", Martel.Rep1(OG))


#--- OC

OC = Simple("OC", "organism_classification")
OC_block = Martel.Group("OC_block", Martel.Rep1(OC))

############ Reference section

#--- RN

# occurs once
# Raw string: same pattern value as before, without relying on Python
# passing unknown escapes ("\[", "\d", "\R") through unchanged.
RN = Martel.Group("RN", Martel.Re(r"RN \[(?P<reference_number>\d+)]\R"))

#--- RP

# occurs once
RP = Simple("RP", "reference_position")


#--- RC

# 0 or more: the block itself needs at least one line (Rep1); "reference"
# below makes the whole block optional.
RC = Simple("RC", "reference_comment")
RC_block = Martel.Group("RC_block", Martel.Rep1(RC))

#--- RX

# 0 or 1
RX = Martel.Group(
    "RX",
    Martel.Re(
        r"RX (?P<bibliographic_database_name>\w+); "
        r"(?P<bibliographic_identifier>\d+)\.\R"
    )
)

#--- RA

# 1 or more
RA = Simple("RA", "reference_author")
RA_block = Martel.Group("RA_block", Martel.Rep1(RA))


#--- RT

# 0 or more (made optional in "reference" below)
RT = Simple("RT", "reference_title")
RT_block = Martel.Group("RT_block", Martel.Rep1(RT))


#--- RL

# 1 or more

RL = Simple("RL", "reference_location")
RL_block = Martel.Group("RL_block", Martel.Rep1(RL))

# One complete literature reference, in the fixed line order
# RN, RP, [RC...], [RX], RA..., [RT...], RL...
reference = Martel.Group(
    "reference",
    RN +
    RP +
    Martel.Opt(RC_block) +
    Martel.Opt(RX) +
    RA_block +
    Martel.Opt(RT_block) +
    RL_block
)


############

#--- CC

# First line of a comment topic: starts with the "-!-" marker.
CC_begin = Martel.Group(
    "CC",
    Martel.Re(r"CC -!- ") +
    Martel.ToEol("comment_text")
)
# Any following CC line of the same topic; it shares the "CC" group name
# with CC_begin.
CC = Martel.Group(
    "CC",
    Martel.Re(r"CC ") +
    Martel.ToEol("comment_text")
)

# One comment topic: the "-!-" line plus zero or more continuation lines.
single_comment = Martel.Group(
    "comment",
    CC_begin +
    Martel.Rep(CC)
)
# All regex literals in this section are raw strings.  None of them used a
# real Python escape, so the pattern values are unchanged; the raw form just
# avoids invalid-escape warnings ("\R", "\d", "\w", "\.") on newer Pythons.

# The copyright notice is a run of CC lines fenced by two dashed CC lines.
CC_copyright_begin = Martel.Group(
    "CC_copyright_begin",
    Martel.Re(r"CC -+\R")
)
# The lookahead keeps a dashed delimiter line from being taken as
# copyright text.
CC_copyright = Martel.Group(
    "CC_copyright",
    Martel.Re(r"CC (?!-+\R)") +
    Martel.ToEol("copyright")
)
CC_copyright_end = Martel.Group(
    "CC_copyright_end",
    Martel.Re(r"CC -+\R")
)

# From N33_HUMAN
# Matches one specific, hard-coded DR line (MIM; 601385; -) which "comment"
# below accepts between the comment topics and the copyright notice.
bogus_DR_group = Martel.Group(
    "bogus_DR_block",
    Martel.Re(
        r"(?P<DR>DR (?P<database_identifier>MIM); "
        r"(?P<primary_identifier>601385); "
        r"(?P<secondary_identifier>-).\R)"
    )
)


# Whole comment section: any number of topics, the optional stray DR line,
# then the optional fenced copyright notice.
comment = Martel.Group(
    "comment_block",
    Martel.Rep(single_comment) +
    Martel.Opt(bogus_DR_group) +
    Martel.Opt(
        CC_copyright_begin +
        Martel.Rep(CC_copyright) +
        CC_copyright_end
    )
)

#--- DR

# The ([^.\R]|(?!.\R)\.)+) is because of things like
# DR MGD; MGI:95401; EPB4.1.
# where I need to scan up to the last "."

# Fallback form: three fields; the last may contain "." as long as it is
# not the final "." before the end of line.
DR_general = Martel.Re(
    r"(?P<database_identifier>[^;]+);"
    r"(?P<primary_identifier>[^;]+); "
    r"(?P<secondary_identifier>([^.\R]|(?!.\R)\.)+)"
)

# PROSITE/PFAM and EMBL references carry a fourth (status) field.
DR_prosite = Martel.Re(
    r"(?P<database_identifier>(PROSITE|PFAM)); "
    r"(?P<primary_identifier>[^;]+); "
    r"(?P<secondary_identifier>[^;]+); "
    r"(?P<status_identifier>[^.]+)"
)

DR_embl = Martel.Re(
    r"(?P<database_identifier>EMBL); "
    r"(?P<primary_identifier>[^;]+); "
    r"(?P<secondary_identifier>[^;]+); "
    r"(?P<status_identifier>[^.]+)"
)

# The specific forms are listed before the general one in the alternation.
DR = Martel.Group(
    "DR",
    Martel.Str("DR ") +
    Martel.Group(
        "database_reference",
        Martel.Alt(DR_embl, DR_prosite, DR_general)
    ) +
    Martel.Str(".") +
    Martel.AnyEol()
)

DR_block = Martel.Group("DR_block", Martel.Rep1(DR))


#--- KW

KW = Simple("KW", "keyword")
KW_block = Martel.Group("KW_block", Martel.Rep1(KW))


#--- FT

# FT DOMAIN 77 88 ASP/GLU-RICH (ACIDIC).
# 123456789012345678901234567890123456789012345678901234567890123456789012345
# 1 2 3 4 5 6 7
# FT ........ ...... ...... .........................................
# FT 12345678 123456 123456 12345678901234567890123456789012345678901
# FT .{8} .{6} .{6} [^\R]*
# 1 1 1234567

# "FT " + ".{8}" + " " + ".{6}" + " " + ".{6}" + " " + "[^\R]*" + "\R"
# "FT .{8} .{6} .{6} [^\R]*\R"

# NOTE(review): the column ruler above has lost its alignment, which
# suggests runs of blanks were collapsed somewhere along the way.  The
# literals below are kept exactly as found; confirm the number of blanks in
# them (and in the other fixed-column patterns) against the upstream file.

# First line of a feature: 8-char key, 6-char "from", 6-char "to" and an
# optional description.
FT_range = Martel.Group(
    "FT",
    Martel.Re(
        r"FT (?P<ft_name>.{8}) "
        r"(?P<ft_from>.{6}) (?P<ft_to>.{6})"
        r"( (?P<ft_description>[^\R]*))?\R"
    )
)
# Further description text for the preceding feature.
FT_continuation = Martel.Group(
    "FT_continuation",
    Martel.Re(
        r"FT "
        r"(?P<ft_description>[^\R]*)\R"
    )
)
FT = Martel.Group("feature", FT_range + Martel.Rep(FT_continuation))

feature_block = Martel.Group("feature_block", Martel.Rep1(FT))


#--- SQ

# SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CRC32;
# (Those X's don't really indicate the size)

SQ = Martel.Group(
    "SQ",
    Martel.Re(
        r"SQ SEQUENCE +(?P<sequence_length>\d+) AA;"
        r" +(?P<molecular_weight>\d+) MW;"
        r" +(?P<crc32>\w+) CRC32;\R"
    )
)
# One line of residues following the SQ header line.
SQ_data = Martel.Group(
    "SQ_data",
    Martel.Re(r" (?P<sequence>[^\R]*)\R")
)

sequence = Martel.Group(
    "sequence_block",
    Martel.Group("SQ_data_block", SQ + Martel.Rep(SQ_data))
)

#--- //

# Record terminator line.
end = Martel.Group("END", Martel.Str("//") + Martel.AnyEol())

####################### put it all together

# A single entry.  ID, AC, the three DT lines, the comment section, the
# sequence and the terminator are always part of the expression; the other
# sections are optional.  Note that a single AC line is used here, not
# AC_block.
record = Martel.Group(
    "swissprot38_record",
    ID +
    AC +
    DT_created +
    DT_seq_update +
    DT_ann_update +
    Martel.Opt(DE_block) +
    Martel.Opt(GN_block) +
    Martel.Opt(OS_block) +
    Martel.Opt(OG_block) +
    Martel.Opt(OC_block) +
    Martel.Group("reference_block", Martel.Rep(reference)) +
    comment +
    Martel.Opt(DR_block) +
    Martel.Opt(KW_block) +
    Martel.Opt(feature_block) +
    sequence +
    end
)

# One or more records expressed as a single expression.
format_expression = Martel.Group("swissprot38", Martel.Rep1(record))

# Record-at-a-time variant built with RecordReader.EndsWith and the "//"
# terminator line (note: a real newline, so this literal must NOT be raw).
# The name shadows the built-in format(), but it is the public name the
# module docstring tells callers to use, so it is kept.
format = Martel.ParseRecords(
    "swissprot38", {}, record,
    RecordReader.EndsWith, ("//\n",)
)