Package Bio :: Package GenBank :: Module Record
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Record

  1  """Hold GenBank data in a straightforward format. 
  2   
  3  classes: 
  4  o Record - All of the information in a GenBank record. 
  5  o Reference - hold reference data for a record. 
  6  o Feature - Hold the information in a Feature Table. 
  7  o Qualifier - Qualifiers on a Feature. 
  8  """ 
  9  # local stuff 
 10  import Bio.GenBank 
 11   
12 -def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
13 """Write a line of GenBank info that can wrap over multiple lines. 14 15 This takes a line of information which can potentially wrap over 16 multiple lines, and breaks it up with carriage returns and 17 indentation so it fits properly into a GenBank record. 18 19 Arguments: 20 21 o information - The string holding the information we want 22 wrapped in GenBank method. 23 24 o indent - The indentation on the lines we are writing. 25 26 o wrap_space - Whether or not to wrap only on spaces in the 27 information. 28 29 o split_char - A specific character to split the lines on. By default 30 spaces are used. 31 """ 32 info_length = Record.GB_LINE_LENGTH - indent 33 34 if wrap_space: 35 info_parts = information.split(split_char) 36 else: 37 cur_pos = 0 38 info_parts = [] 39 while cur_pos < len(information): 40 info_parts.append(information[cur_pos: cur_pos + info_length]) 41 cur_pos += info_length 42 43 # first get the information string split up by line 44 output_parts = [] 45 cur_part = "" 46 for info_part in info_parts: 47 if len(cur_part) + 1 + len(info_part) > info_length: 48 if cur_part: 49 if split_char != " ": 50 cur_part += split_char 51 output_parts.append(cur_part) 52 cur_part = info_part 53 else: 54 if cur_part == "": 55 cur_part = info_part 56 else: 57 cur_part += split_char + info_part 58 59 # add the last bit of information to the output 60 if cur_part: 61 output_parts.append(cur_part) 62 63 # now format the information string for return 64 output_info = output_parts[0] + "\n" 65 for output_part in output_parts[1:]: 66 output_info += " " * indent + output_part + "\n" 67 68 return output_info
69
70 -def _indent_genbank(information, indent):
71 """Write out information with the specified indent. 72 73 Unlike _wrapped_genbank, this function makes no attempt to wrap 74 lines -- it assumes that the information already has newlines in the 75 appropriate places, and will add the specified indent to the start of 76 each line. 77 """ 78 # split the info into lines based on line breaks 79 info_parts = information.split("\n") 80 81 # the first line will have no indent 82 output_info = info_parts[0] + "\n" 83 for info_part in info_parts[1:]: 84 output_info += " " * indent + info_part + "\n" 85 86 return output_info
87
class Record:
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written on the DEFINITION line.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq
    file.
    o primary - A list, empty by default. It is stored on the record but is
    not written by __str__ (presumably PRIMARY block data -- confirm
    against the parser that fills it in).
    """
    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # keyword starting in the first column, padded out to the base indent
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    # sub-keywords (ie. ORGANISM, AUTHORS) are indented inside that column
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # sequence positions are right justified in front of each sequence line
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record; every field starts out blank."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        # collect the sections in record order and join them once at the end
        sections = [
            self._locus_line(),
            self._definition_line(),
            self._accession_line(),
            self._version_line(),
            self._nid_line(),
            self._pid_line(),
            self._keywords_line(),
            self._db_source_line(),
            self._segment_line(),
            self._source_line(),
            self._organism_line(),
        ]
        sections.extend([str(reference) for reference in self.references])
        sections.append(self._comment_line())
        sections.append(self._features_line())
        sections.extend([str(feature) for feature in self.features])
        sections.append(self._base_count_line())
        sections.append(self._origin_line())
        sections.append(self._sequence_line())
        sections.append(self._contig_line())
        # record terminator; note that no newline follows it
        sections.append("//")
        return "".join(sections)

    def _locus_line(self):
        """Provide the output string for the LOCUS line.

        The line is built column by column from the locus name, size,
        residue type, division and date.
        """
        output = "LOCUS"
        output += " " * 7  # 6-12 spaces
        output += "%-9s" % self.locus
        output += " "  # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10  # spaces for circular
        else:
            output += " " * 3  # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10  # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7  # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line.

        The line is always written, wrapped at the base indent.
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line.

        The accession numbers are written space separated and wrapped.
        """
        output = Record.BASE_FORMAT % "ACCESSION"

        acc_info = " ".join([str(accession) for accession in self.accession])
        # strip off any extra space at the end
        acc_info = acc_info.rstrip()
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)

        return output

    def _version_line(self):
        """Output for the VERSION line.

        Nothing is written when the record has neither a version nor a gi,
        and the GI field is only appended when a gi is actually present, so
        records without one do not end up with a dangling 'GI:'.
        """
        if not self.version and not self.gi:
            return ""

        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        if self.gi:
            # GenBank flat files put two spaces between the
            # accession.version and the GI field
            output += "  GI:%s" % self.gi
        output += "\n"
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.

        Returns an empty string when no nid is set.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.

        Returns an empty string when no pid is set.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written. Keywords are separated by '; ' and the
        list is closed with a period, so an empty keyword list is written
        as a lone '.'.
        """
        output = Record.BASE_FORMAT % "KEYWORDS"
        keyword_info = "; ".join([str(keyword) for keyword in self.keywords])
        keyword_info += "."
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.

        Returns an empty string when no db_source is set.
        """
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line.

        Returns an empty string when the record is not part of a series.
        """
        output = ""
        if self.segment:
            output += Record.BASE_FORMAT % "SEGMENT"
            output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from.

        The line is always written, wrapped at the base indent.
        """
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.

        The organism name goes on the ORGANISM line itself; the taxonomy
        follows on indented lines, '; ' separated and closed with a period.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        output += "%s\n" % self.organism
        # the taxonomy starts on its own line, so it needs the indent that
        # _wrapped_genbank only adds to continuation lines
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join([str(tax) for tax in self.taxonomy])
        taxonomy_info += "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)

        return output

    def _comment_line(self):
        """Output for the COMMENT lines.

        The comment text keeps its own line breaks; it is only indented.
        Returns an empty string when there is no comment.
        """
        output = ""
        if self.comment:
            output += Record.BASE_FORMAT % "COMMENT"
            output += _indent_genbank(self.comment,
                                      Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line.

        This is only the header of the feature table, and is written only
        when the record has at least one feature.
        """
        if not self.features:
            return ""
        return Record.BASE_FEATURE_FORMAT % "FEATURES" + \
               "Location/Qualifiers\n"

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.

        Returns an empty string when no base counts are set.
        """
        if not self.base_counts:
            return ""

        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        # deal with the standard case, with a normal base count line
        # like: 474 a 356 c 428 g 364 t
        if len(count_parts) % 2 == 0:
            for pos in range(0, len(count_parts), 2):
                output += "%7s %s" % (count_parts[pos], count_parts[pos + 1])
        # deal with ugly BASE COUNT lines like:
        # 1311257 a2224835 c2190093 g1309889 t
        # by just outputting the raw information
        else:
            output += self.base_counts
        output += "\n"
        return output

    def _origin_line(self):
        """Output for the ORIGIN line

        Returns an empty string when the record has no sequence.
        """
        output = ""
        # only output the ORIGIN line if we have a sequence
        if self.sequence:
            output += Record.BASE_FORMAT % "ORIGIN"
            if self.origin:
                output += _wrapped_genbank(self.origin,
                                           Record.GB_BASE_INDENT)
            else:
                output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per line in
        groups of 10, each line starting with the right justified position
        of its first residue. Returns an empty string when there is no
        sequence.
        """
        if not self.sequence:
            return ""

        seq_length = len(self.sequence)
        lines = []
        for line_start in range(0, seq_length, 60):
            line = Record.SEQUENCE_FORMAT % str(line_start + 1)
            # Only walk over groups that actually hold residues, so that a
            # sequence ending exactly on a group boundary does not get an
            # empty group (a stray trailing space) appended.
            line_end = min(line_start + 60, seq_length)
            for group_start in range(line_start, line_end, 10):
                seq_section = self.sequence[group_start:group_start + 10]
                line += " %s" % seq_section.lower()
            lines.append(line + "\n")
        return "".join(lines)

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq.

        The location is wrapped on commas. Returns an empty string when no
        contig is set.
        """
        output = ""
        if self.contig:
            output += Record.BASE_FORMAT % "CONTIG"
            output += _wrapped_genbank(self.contig,
                                       Record.GB_BASE_INDENT, split_char = ',')
        return output
446
class Reference:
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """
    def __init__(self):
        """Create a reference with every field blank."""
        self.number = ''
        self.bases = ''
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the reference as GenBank formatted text.

        The REFERENCE line is always written; each of the other lines only
        shows up when its field has a value.
        """
        sections = [
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ]
        return "".join(sections)

    def _reference_line(self):
        """Output for REFERENCE lines.

        The number is padded to three columns when bases follow it.
        """
        pieces = [Record.BASE_FORMAT % "REFERENCE"]
        if self.number:
            if self.bases:
                pieces.append("%-3s" % self.number)
                pieces.append("%s" % self.bases)
            else:
                pieces.append("%s" % self.number)
        pieces.append("\n")
        return "".join(pieces)

    def _authors_line(self):
        """Output for AUTHORS information (empty when no authors)."""
        if not self.authors:
            return ""
        keyword = Record.INTERNAL_FORMAT % "AUTHORS"
        return keyword + _wrapped_genbank(self.authors, Record.GB_BASE_INDENT)

    def _consrtm_line(self):
        """Output for CONSRTM information (empty when no consortium)."""
        if not self.consrtm:
            return ""
        keyword = Record.INTERNAL_FORMAT % "CONSRTM"
        return keyword + _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT)

    def _title_line(self):
        """Output for TITLE information (empty when no title)."""
        if not self.title:
            return ""
        keyword = Record.INTERNAL_FORMAT % "TITLE"
        return keyword + _wrapped_genbank(self.title, Record.GB_BASE_INDENT)

    def _journal_line(self):
        """Output for JOURNAL information (empty when no journal)."""
        if not self.journal:
            return ""
        keyword = Record.INTERNAL_FORMAT % "JOURNAL"
        return keyword + _wrapped_genbank(self.journal, Record.GB_BASE_INDENT)

    def _medline_line(self):
        """Output for MEDLINE information (empty when no medline id)."""
        if not self.medline_id:
            return ""
        keyword = Record.INTERNAL_FORMAT % "MEDLINE"
        return keyword + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information (empty when no pubmed id).

        PUBMED uses the other internal format, which indents the keyword
        one column further than the remaining reference keywords.
        """
        if not self.pubmed_id:
            return ""
        keyword = Record.OTHER_INTERNAL_FORMAT % "PUBMED"
        return keyword + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information (empty when no remark)."""
        if not self.remark:
            return ""
        keyword = Record.INTERNAL_FORMAT % "REMARK"
        return keyword + _wrapped_genbank(self.remark, Record.GB_BASE_INDENT)
560
class Feature:
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """
    def __init__(self):
        """Create a feature with no key, no location and no qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature as GenBank feature table text.

        The key and the comma-wrapped location come first, followed by one
        wrapped entry per qualifier at the feature indent.
        """
        feature_indent = Record.GB_FEATURE_INDENT

        pieces = [Record.INTERNAL_FEATURE_FORMAT % self.key]
        pieces.append(_wrapped_genbank(self.location, feature_indent,
                                       split_char = ','))

        for qualifier in self.qualifiers:
            pieces.append(" " * feature_indent)

            # Qualifiers whose key matches one of the consumer's
            # remove_space_keys must not be wrapped on spaces.
            no_space_keys = Bio.GenBank._BaseGenBankConsumer.remove_space_keys
            if any(no_space_key in qualifier.key
                   for no_space_key in no_space_keys):
                space_wrap = 0
            else:
                space_wrap = 1

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           feature_indent, space_wrap))
        return "".join(pieces)
591
class Qualifier:
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """
    def __init__(self):
        """Create a qualifier whose key and value are both blank."""
        self.key = self.value = ''
602