1 """Hold GenBank data in a straightforward format.
2
3 classes:
4 o Record - All of the information in a GenBank record.
5 o Reference - hold reference data for a record.
6 o Feature - Hold the information in a Feature Table.
7 o Qualifier - Qualifiers on a Feature.
8 """
9
10 import Bio.GenBank
11
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the lines we are writing.

    o wrap_space - Whether or not to wrap only on spaces in the
    information. When false, the information is cut into fixed-width
    pieces instead of being split on split_char.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    Returns the wrapped text. The first line carries no indent (the caller
    is expected to have written the line header already), every following
    line is prefixed with ``indent`` spaces, and every line ends with a
    newline. If there is nothing to write, a single newline is returned.
    """
    # Width available for text once the indent has been accounted for.
    info_length = Record.GB_LINE_LENGTH - indent

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        # No natural break points: cut into pieces of the available width.
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # First get the information split up into one string per output line.
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        # The "+ 1" reserves room for the separator that would join them.
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # A non-space separator is kept at the end of the line it
                # terminated; a space separator is simply dropped.
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # Add the last bit of information to the output.
    if cur_part:
        output_parts.append(cur_part)

    # Empty input (or input made only of separators) produces no parts;
    # terminate the line instead of failing on output_parts[0].
    if not output_parts:
        return "\n"

    # Continuation lines are indented; the first line is not.
    return ("\n" + " " * indent).join(output_parts) + "\n"
69
71 """Write out information with the specified indent.
72
73 Unlike _wrapped_genbank, this function makes no attempt to wrap
74 lines -- it assumes that the information already has newlines in the
75 appropriate places, and will add the specified indent to the start of
76 each line.
77 """
78
79 info_parts = information.split("\n")
80
81
82 output_info = info_parts[0] + "\n"
83 for info_part in info_parts[1:]:
84 output_info += " " * indent + info_part + "\n"
85
86 return output_info
87
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq
    file.
    """

    # Layout constants: line width and the column widths used to build
    # the format strings below.
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # Left-justified keyword padded to the base indent.
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    # Keywords nested under another keyword: leading spaces, then padding
    # so the text after them still starts at the base indent.
    INTERNAL_FORMAT = (" " * GB_INTERNAL_INDENT + "%-" +
                       str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s")
    OTHER_INTERNAL_FORMAT = (" " * GB_OTHER_INTERNAL_INDENT + "%-" +
                             str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) +
                             "s")

    # Same idea for the feature table, which uses a wider indent.
    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = (" " * GB_FEATURE_INTERNAL_INDENT + "%-" +
                               str(GB_FEATURE_INDENT -
                                   GB_FEATURE_INTERNAL_INDENT) + "s")
    # Right-justified position number at the start of each sequence row.
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def _locus_line(self):
        """Provide the output string for the LOCUS line.

        Built from self.locus, self.size, self.residue_type,
        self.data_file_division and self.date; ends with a newline.
        """
        pieces = [
            "LOCUS",
            " " * 7,
            "%-9s" % self.locus,
            " ",
            "%7s" % self.size,
        ]

        # Proteins are measured in amino acids, everything else in base pairs.
        if "PROTEIN" in self.residue_type:
            pieces.append(" aa")
        else:
            pieces.append(" bp ")

        if "circular" in self.residue_type:
            # Circular types have long residue type strings, so they get
            # one wide right-justified field and no extra padding.
            pieces.append("%17s" % self.residue_type)
        elif "-" in self.residue_type:
            # Stranded types such as 'ss-DNA'.
            pieces.append("%7s" % self.residue_type)
            pieces.append(" " * 10)
        else:
            pieces.append(" " * 3)
            pieces.append("%-4s" % self.residue_type)
            pieces.append(" " * 10)

        pieces.append(" " * 2)
        pieces.append("%3s" % self.data_file_division)
        pieces.append(" " * 7)
        pieces.append("%11s" % self.date)
        pieces.append("\n")
        return "".join(pieces)

    def _version_line(self):
        """Output for the VERSION line.

        Writes self.version followed by the GI taken from self.gi.
        """
        return "".join([
            Record.BASE_FORMAT % "VERSION",
            self.version,
            " GI:",
            "%s\n" % self.gi,
        ])

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.

        Returns an empty string when self.nid is not set.
        """
        if not self.nid:
            return ""
        return (Record.BASE_FORMAT % "NID") + ("%s\n" % self.nid)

    def _pid_line(self):
        """Output for PID line. Presumably, PID usage is also obsolete.

        Returns an empty string when self.pid is not set.
        """
        if not self.pid:
            return ""
        return (Record.BASE_FORMAT % "PID") + ("%s\n" % self.pid)

    def _db_source_line(self):
        """Output for DBSOURCE line.

        Returns an empty string when self.db_source is not set.
        """
        if not self.db_source:
            return ""
        return (Record.BASE_FORMAT % "DBSOURCE") + ("%s\n" % self.db_source)

    def _features_line(self):
        """Output for the FEATURES line.

        This is only the header of the feature table; it is left out
        entirely (empty string) when the record has no features.
        """
        if len(self.features) > 0:
            return (Record.BASE_FEATURE_FORMAT % "FEATURES" +
                    "Location/Qualifiers\n")
        return ""

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.

        self.base_counts is split on spaces into (count, base) pairs which
        are written in fixed-width columns. If the pieces do not pair up,
        the string is written out unchanged. Returns an empty string when
        self.base_counts is not set.
        """
        if not self.base_counts:
            return ""

        output = Record.BASE_FORMAT % "BASE COUNT "

        # Runs of spaces produce empty strings when splitting; drop them.
        count_parts = [part for part in self.base_counts.split(" ") if part]

        if len(count_parts) % 2 == 0:
            # Even number of pieces: consecutive (count, base type) pairs.
            output += "".join(
                "%7s %s" % (count_parts[pos], count_parts[pos + 1])
                for pos in range(0, len(count_parts), 2))
        else:
            # Cannot be paired up; fall back to the raw text.
            output += self.base_counts

        return output + "\n"

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per row in six
        space-separated groups of 10, each row starting with the 1-based
        position of its first residue. Returns an empty string when
        self.sequence is not set.
        """
        if not self.sequence:
            return ""

        seq_length = len(self.sequence)
        rows = []
        for row_start in range(0, seq_length, 60):
            row = Record.SEQUENCE_FORMAT % str(row_start + 1)
            # Only step through groups that actually contain residues, so
            # a row ending exactly on a group boundary gets no trailing
            # space from an empty group.
            row_end = min(row_start + 60, seq_length)
            for group_start in range(row_start, row_end, 10):
                group = self.sequence[group_start:group_start + 10]
                row += " %s" % group.lower()
            rows.append(row + "\n")
        return "".join(rows)
436
446
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """

    def _reference_line(self):
        """Output for REFERENCE lines.

        Writes the REFERENCE keyword followed by self.number and, when
        present, self.bases. With no number set only the keyword is
        written. The line always ends with a newline.
        """
        output = Record.BASE_FORMAT % "REFERENCE"
        if self.number:
            if self.bases:
                # Pad the number so the base range lines up after it.
                output += "%-3s" % self.number
                output += "%s" % self.bases
            else:
                output += "%s" % self.number
        return output + "\n"
497
506
515
524
533
542
551
560
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Create a feature with an empty key, location and qualifier list."""
        self.key = ''
        self.location = ''
        # A fresh list per instance, so features never share qualifiers.
        self.qualifiers = []
573
591
593 """Hold information about a qualifier in a GenBank feature.
594
595 Attributes:
596 o key - The key name of the qualifier (ie. /organism=)
597 o value - The value of the qualifier ("Dictyostelium discoideum").
598 """
602