1 """Parser for the SWISS-PROT 38 format.
2
3 You probably want to use the variables 'record' (for a single record)
4 and 'format' (for a set of records).
5
6 """
7
8 import warnings
# Importing this module always raises a DeprecationWarning.  The message is
# split over several adjacent literals purely for readability; the text that
# reaches the user is unchanged.
warnings.warn(
    "Bio.expressions was deprecated, as it does not work with recent "
    "versions of mxTextTools. If you want to continue to use this module, "
    "please get in contact with the Biopython developers at "
    "biopython-dev@biopython.org to avoid permanent removal of this module "
    "from Biopython",
    DeprecationWarning)
10
11
12 import Martel
13 from Martel import RecordReader, Time
14 from Bio import Std
15
21
22
# ID line: entry name, data class, molecule type and sequence length.
#
# A SWISS-PROT line is a two-letter line code followed by three blanks, so
# the data starts in column 6.  The prefix must therefore be "ID" plus THREE
# spaces; with a single space the entry-name word that follows would have to
# start on a blank and the line could not be parsed.
ID = Martel.Group(
    "ID",
    Martel.Str("ID   ")
    + Std.dbid(Martel.Word("entry_name"),
               {"type": "primary", "dbname": "sp"})
    + Martel.Spaces()
    + Martel.Word("data_class_table")
    + Martel.Str(";") + Martel.Spaces()
    + Martel.Word("molecule_type")
    + Martel.Str(";") + Martel.Spaces()
    + Martel.Digits("sequence_length")
    + Martel.Str(" AA.")
    + Martel.AnyEol()
)
36
37
# AC line: one or more accession numbers, each terminated by ";" and
# separated from the next by a single blank.
#
# The line code is followed by three blanks (data starts in column 6), so the
# prefix is "AC" plus THREE spaces; a one-space prefix leaves blanks in front
# of the first accession number, which Martel.Word cannot consume.
AC = Martel.Group(
    "AC",
    Martel.Str("AC   ")
    # The first accession on the line is tagged with the "sp" database name.
    + Std.dbid(Martel.Word("ac_number"),
               {"type": "accession", "dbname": "sp"})
    + Martel.Str(";")
    # Any further accessions on the same line.
    + Martel.Rep(
        Martel.Str(" ")
        + Std.dbid(Martel.Word("ac_number"), {"type": "accession"})
        + Martel.Str(";"))
    + Martel.AnyEol()
)

# One or more consecutive AC lines.
AC_block = Martel.Group("AC_block", Martel.Rep1(AC))
51
52
53
54
55
56
57
58
# DT lines: creation date, last sequence update and last annotation update,
# each followed by "(Rel. NN, <event>)".
#
# Fixes relative to the previous version:
#  * the line code is followed by three blanks (data starts in column 6), so
#    every prefix is "DT" plus THREE spaces;
#  * all patterns are raw strings, so "\(", "\d" and "\R" reach Martel
#    verbatim instead of relying on Python keeping unknown escapes.
DT_created = Martel.Group(
    "DT_created",
    Martel.Str("DT   ")
    + Time.make_expression("%(DD)-%(Jan)-%(YYYY)")
    + Martel.Re(r" \(Rel. (?P<release>\d\d), Created\)\R"))


DT_seq_update = Martel.Group(
    "DT_seq_update",
    Martel.Re(
        r"DT   (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
        r"(?P<release>\d\d), Last sequence update\)\R"))

DT_ann_update = Martel.Group(
    "DT_ann_update",
    Martel.Re(
        r"DT   (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
        r"(?P<release>\d\d), Last annotation update\)\R"))
74
75
76
77
78
# DE line: free-text description running to the end of the line.
#
# The line code is followed by three blanks (data starts in column 6).  With
# a one-space prefix the two remaining blanks would be captured as part of
# the description text, so the prefix is "DE" plus THREE spaces.
DE = Martel.Group(
    "DE",
    Martel.Str("DE   ")
    + Std.description(Martel.UntilEol("description"))
    + Martel.AnyEol())

# One or more consecutive DE lines, wrapped as a description block.
DE_block = Std.description_block(
    Martel.Group("DE_block", Martel.Rep1(DE)))
85
86
87
88
# Single-field lines built with the module's Simple() helper (defined outside
# this section).  Each XX_block groups one or more consecutive XX lines.

# GN: gene names.
GN = Simple("GN", "gene_names")
GN_block = Martel.Group(
    "GN_block",
    Martel.Rep1(GN))

# OS: organism species.
OS = Simple("OS", "organism_species")
OS_block = Martel.Group(
    "OS_block",
    Martel.Rep1(OS))

# OG: organelle.
OG = Simple("OG", "organelle")
OG_block = Martel.Group(
    "OG_block",
    Martel.Rep1(OG))

# OC: organism classification.
OC = Simple("OC", "organism_classification")
OC_block = Martel.Group(
    "OC_block",
    Martel.Rep1(OC))
109
110
111
112
113
114
# RN line: the reference number in square brackets.
#
# The line code is followed by three blanks (data starts in column 6), so the
# pattern begins with "RN" plus THREE spaces.  It is written as a raw string
# so that "\[", "\d" and "\R" are passed to Martel unchanged.
RN = Martel.Group(
    "RN",
    Martel.Re(r"RN   \[(?P<reference_number>\d+)]\R"))
116
117
118
119
# RP: reference position (a single line per reference).
RP = Simple("RP", "reference_position")


# RC: reference comment; RC_block is one or more consecutive RC lines.
RC = Simple("RC", "reference_comment")
RC_block = Martel.Group(
    "RC_block",
    Martel.Rep1(RC))
128
129
130
131
# RX line: bibliographic database name and numeric identifier, "NAME; NNN.".
#
# The line code is followed by three blanks (data starts in column 6), so the
# pattern begins with "RX" plus THREE spaces; otherwise "\w+" would have to
# match the leading blanks and the line would be rejected.  Raw strings keep
# "\w", "\d", "\." and "\R" intact for Martel.
RX = Martel.Group(
    "RX",
    Martel.Re(
        r"RX   (?P<bibliographic_database_name>\w+); "
        r"(?P<bibliographic_identifier>\d+)\.\R"))
135
136
137
138
# Remaining reference lines, each built with the module's Simple() helper
# (defined outside this section) and grouped into one-or-more blocks.

# RA: reference author(s).
RA = Simple("RA", "reference_author")
RA_block = Martel.Group(
    "RA_block",
    Martel.Rep1(RA))

# RT: reference title.
RT = Simple("RT", "reference_title")
RT_block = Martel.Group(
    "RT_block",
    Martel.Rep1(RT))

# RL: reference location.
RL = Simple("RL", "reference_location")
RL_block = Martel.Group(
    "RL_block",
    Martel.Rep1(RL))
156
# A complete literature reference.  Line order is fixed: RN and RP are
# mandatory single lines, RC and RX are optional, the author and location
# blocks are mandatory and the title block is optional.
reference = Martel.Group(
    "reference",
    RN
    + RP
    + Martel.Opt(RC_block)
    + Martel.Opt(RX)
    + RA_block
    + Martel.Opt(RT_block)
    + RL_block
)
166
167
168
169
170
171
# CC lines: a comment starts with "CC   -!- " and continues on lines that
# begin with "CC" followed by seven blanks (both prefixes are nine columns
# wide, so the text lines up).
#
# The continuation prefix must be the full nine columns.  A bare "CC " prefix
# also matches the "-!-" line that opens the NEXT comment and the dashed
# copyright delimiter, so Martel.Rep(CC) would swallow them and the following
# rules could never match.
CC_begin = Martel.Group(
    "CC",
    Martel.Re("CC   -!- ")
    + Martel.ToEol("comment_text"))
CC = Martel.Group(
    "CC",
    Martel.Re("CC" + " " * 7)
    + Martel.ToEol("comment_text"))

# One comment: its opening line plus any number of continuation lines.
single_comment = Martel.Group(
    "comment",
    CC_begin + Martel.Rep(CC))
183
184
# Copyright notice inside the CC section: a line of dashes, any number of
# text lines, and a closing line of dashes.
#
# The line code is followed by three blanks (data starts in column 6), so
# each pattern begins with "CC" plus THREE spaces.  Raw strings keep "\R"
# intact for Martel.
CC_copyright_begin = Martel.Group(
    "CC_copyright_begin",
    Martel.Re(r"CC   -+\R"))
# A text line is any CC line that is not itself a dashed delimiter (the
# negative lookahead rejects a dashes-only remainder).
CC_copyright = Martel.Group(
    "CC_copyright",
    Martel.Re(r"CC   (?!-+\R)")
    + Martel.ToEol("copyright"))
CC_copyright_end = Martel.Group(
    "CC_copyright_end",
    Martel.Re(r"CC   -+\R"))
192
193
# Matches one specific, hard-coded DR line (database MIM, identifier 601385,
# secondary identifier "-") so that it can be accepted outside the regular
# DR block.
#
# The line code is followed by three blanks (data starts in column 6), so the
# pattern begins with "DR" plus THREE spaces; with one space the literal
# "MIM" could not match.
bogus_DR_group = Martel.Group(
    "bogus_DR_block",
    Martel.Re(
        r"(?P<DR>DR   (?P<database_identifier>MIM); "
        r"(?P<primary_identifier>601385); "
        r"(?P<secondary_identifier>-).\R)")
)
199
200
# The whole CC section: zero or more comments, then the optional stray DR
# line, then an optional copyright notice (opening delimiter, text lines,
# closing delimiter).
comment = Martel.Group(
    "comment_block",
    Martel.Rep(single_comment)
    + Martel.Opt(bogus_DR_group)
    + Martel.Opt(
        CC_copyright_begin
        + Martel.Rep(CC_copyright)
        + CC_copyright_end)
)
209
210
211
212
213
214
215
216
# Text of the last DR field: one or more characters, where a "." is accepted
# only when the lookahead "(?!.\R)" succeeds, i.e. when it is not directly
# followed by the end of the line.  The terminating "." is thus left
# unconsumed for the enclosing DR rule.
_to_secondary_end = Martel.Re(r"([^.\R]|(?!.\R)\.)+")

# Building blocks shared by the DR alternatives: database name (up to the
# first ";"), primary identifier (up to the next ";") and the secondary
# identifier (rest of the field, see above).
database_id = Std.dbxref_dbname(
    Martel.UntilSep("database_identifier", ";"),
    {"style": "sp"})

primary_id = Std.dbxref_dbid(
    Martel.UntilSep("primary_identifier", ";"),
    {"type": "primary"})

secondary_id = Std.dbxref_dbid(
    Martel.Group("secondary_identifier", _to_secondary_end),
    {"type": "accession"})
228
229
# Generic cross-reference: "<database>; <primary id>; <secondary id>".
real_DR_general = Std.dbxref(
    database_id
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + secondary_id,
)
fast_DR_general = Std.fast_dbxref(
    real_DR_general,
    {"style": "sp-general"})

# FastFeature is given the fast expression, the feature name and the group
# names of the full expression.
DR_general = Martel.FastFeature(
    fast_DR_general,
    "fast-sp-dbxref",
    real_DR_general.group_names())
239
240
241
# PROSITE / PFAM cross-reference: database name, primary id, an unnamed
# accession field and a trailing status field that runs up to the final ".".
real_DR_prosite = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group("database_identifier",
                     Martel.Str("PROSITE", "PFAM")),
        {"style": "sp"})
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(Martel.UntilSep(sep=";"), {"type": "accession"})
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)


fast_DR_prosite = Std.fast_dbxref(
    real_DR_prosite,
    {"style": "sp-prosite"})

DR_prosite = Martel.FastFeature(
    fast_DR_prosite,
    "fast-sp-dbxref",
    real_DR_prosite.group_names())
259
# EMBL cross-reference: database name, primary id, secondary (accession) id
# and a trailing status field that runs up to the final ".".
real_DR_embl = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group("database_identifier", Martel.Str("EMBL")),
        {"style": "sp"})
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(
        Martel.UntilSep("secondary_identifier", ";"),
        {"type": "accession"})
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)

fast_DR_embl = Std.fast_dbxref(
    real_DR_embl,
    {"style": "sp-embl"})
DR_embl = Martel.FastFeature(
    fast_DR_embl,
    "fast-sp-dbxref",
    real_DR_embl.group_names())
276
# DR line: one database cross-reference terminated by ".".  The specialised
# EMBL and PROSITE/PFAM forms are tried before the generic one.
#
# The line code is followed by three blanks (data starts in column 6), so the
# prefix is "DR" plus THREE spaces.  With a one-space prefix the literal
# database names ("EMBL", "PROSITE", "PFAM") could never match and the
# generic form would capture the leftover blanks as part of the name.
DR = Martel.Group(
    "DR",
    Martel.Str("DR   ")
    + Martel.Group("database_reference",
                   DR_embl | DR_prosite | DR_general)
    + Martel.Str(".")
    + Martel.AnyEol())

# One or more consecutive DR lines.
DR_block = Martel.Group("DR_block", Martel.Rep1(DR))
283
284
285
286
287
# KW: keyword line, built with the module's Simple() helper (defined outside
# this section); KW_block is one or more consecutive KW lines.
KW = Simple("KW", "keyword")
KW_block = Martel.Group(
    "KW_block",
    Martel.Rep1(KW))
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
# FT lines: the feature table, a fixed-column layout.
#
#   columns  1- 2   "FT"
#   columns  6-13   key name        (8 characters)
#   columns 15-20   "from" endpoint (6 characters)
#   columns 22-27   "to" endpoint   (6 characters)
#   columns 35-     description
#
# The separators are written with explicit widths.  A collapsed "FT "
# continuation prefix is a real bug: it also matches the first line of the
# NEXT feature, so Martel.Rep(FT_continuation) would absorb every following
# feature into the current one.
FT_name = Std.feature_name(Martel.Re(r".{8}"))
FT_start = Std.feature_location_start(Martel.Re(r".{6}"))
FT_end = Std.feature_location_end(Martel.Re(r".{6}"))
FT_desc = Std.feature_description(Martel.UntilEol())

# First line of a feature: key name, endpoints and an optional description.
FT_range = (
    Martel.Str("FT   ")                          # line code + 3 blanks
    + FT_name
    + Martel.Str(" ")
    + FT_start
    + Martel.Str(" ")
    + FT_end
    + Martel.Opt(Martel.Str(" " * 7) + FT_desc)  # blanks in columns 28-34
    + Martel.AnyEol()
)

# Continuation line: only the description column is filled in.
FT_continuation = (
    Martel.Str("FT" + " " * 32)                  # blanks in columns 3-34
    + FT_desc
    + Martel.AnyEol()
)

# One feature: its first line plus any continuation lines.
FT = Std.feature(
    FT_range + Martel.Rep(FT_continuation),
    {"location-style": "sp"})
337
338
339
# The feature table as a whole: one or more features.
feature_block = Std.feature_block(
    Martel.Rep1(FT),
    {"style": "swissprot"})
342
343
344
345
346
347
348
# SQ line: sequence header giving length (AA), molecular weight (MW) and the
# CRC32 checksum.
#
# The line code is followed by three blanks (data starts in column 6), so the
# pattern begins with "SQ" plus THREE spaces.  Raw strings are used so that
# "\d", "\w" and "\R" are handed to Martel unchanged.
SQ = Martel.Group(
    "SQ",
    Martel.Re(
        r"SQ   SEQUENCE +(?P<sequence_length>\d+) AA;"
        r" +(?P<molecular_weight>\d+) MW;"
        r" +(?P<crc?type=32>\w+) CRC32;\R")
)
354
355
# Sequence data line: no line code, just five blanks (columns 1-5) followed
# by the residues.  The indent is written with an explicit width; with a
# single blank the remaining four blanks would become part of the captured
# sequence text.
SQ_data = (
    Martel.Str(" " * 5)
    + Std.sequence(Martel.UntilEol())
    + Martel.AnyEol()
)


# The SQ header followed by any number of data lines.
sequence = Std.sequence_block(
    SQ + Martel.Rep(SQ_data),
    {"alphabet": "iupac-ambiguous-protein"})
365
366
367
# Record terminator: a line consisting of "//".
end = Martel.Group(
    "END",
    Martel.Str("//") + Martel.AnyEol())
369
370
371
# Grammar for one complete SWISS-PROT 38 entry.  The sections appear in this
# fixed order; those wrapped in Martel.Opt may be absent.
record = Std.record(
    # Identification, accession numbers and the three date lines.
    ID
    + AC_block
    + DT_created
    + DT_seq_update
    + DT_ann_update
    # Description, gene and organism information.
    + Martel.Opt(DE_block)
    + Martel.Opt(GN_block)
    + Martel.Opt(OS_block)
    + Martel.Opt(OG_block)
    + Martel.Opt(OC_block)
    # OX_block wraps a NullOp: the group is emitted but matches no text.
    + Martel.Group("OX_block", Martel.NullOp())
    # References, comments, cross-references, keywords and features.
    + Martel.Group("reference_block", Martel.Rep(reference))
    + comment
    + Martel.Opt(DR_block)
    + Martel.Opt(KW_block)
    + Martel.Opt(feature_block)
    # Sequence and the "//" terminator.
    + sequence
    + end,
    {"format": "swissprot/38"})
392
393
# A data set is one or more records, expressed as a single Martel expression.
format_expression = Martel.Group(
    "dataset",
    Martel.Rep1(record),
    {"format": "swissprot/38"})

# Record-oriented form of the same grammar: the EndsWith reader is given the
# "//" terminator line and each record is parsed with `record`.
format = Martel.ParseRecords(
    "dataset",
    {"format": "swissprot/38"},
    record,
    RecordReader.EndsWith,
    ("//\n",))
399
if __name__ == "__main__":
    # Manual smoke test: parse a SWISS-PROT 38 file, keeping only the
    # "entry_name" and "sequence" elements.
    #
    # The file to parse may be given as the first command-line argument; the
    # historical hard-coded path is kept as the default.
    import sys

    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = "/home/dalke/ftps/swissprot/sprot38.dat"

    exp = Martel.select_names(format, ("entry_name", "sequence"))
    parser = exp.make_parser()

    # Close the input file even if parsing fails.
    infile = open(filename)
    try:
        parser.parseFile(infile)
    finally:
        infile.close()
404