Package Bio :: Package PDB :: Module parse_pdb_header'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.parse_pdb_header'

  1  #!/usr/bin/env python 
  2  # 
  3  # parse_pdb_header.py 
  4  # parses header of PDB files into a python dictionary. 
  5  # emerged from the Columba database project www.columba-db.de. 
  6  #  
  7  # author: Kristian Rother 
  8  #  
  9  # license: same as BioPython, read LICENSE.TXT from current BioPython release. 
 10  #  
 11  # last modified: 9.2.2004 
 12  # 
 13  # Added some small changes: the whole PDB file is not read in anymore, but just 
 14  # until the first ATOM record (faster). I also split parse_pdb_header into  
 15  # parse_pdb_header and parse_pdb_header_list, because parse_pdb_header_list 
 16  # can be more easily reused in PDBParser. 
 17  # 
 18  # Thomas, 19/03/04 
 19  # 
 20  # Renamed some clearly private functions to _something (ie. parse_pdb_header_list 
 21  # is now _parse_pdb_header_list) 
 22  # Thomas 9/05/04 
 23   
 24  __doc__="Parse the header of a PDB file." 
 25   
 26  import sys 
 27  import os,string,re 
 28  import urllib 
 29  import types 
 30   
 31   
32 -def _get_journal(inl):
33 # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7 34 journal="" 35 for l in inl: 36 if re.search("\AJRNL",l): 37 journal+=string.lower(l[19:72]) 38 journal=re.sub("\s\s+"," ",journal) 39 return journal
40
41 -def _get_references(inl):
42 # REMARK 1 REFERENCE 1 1CSE 11 43 # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12 44 references=[] 45 actref="" 46 for l in inl: 47 if re.search("\AREMARK 1",l): 48 if re.search("\AREMARK 1 REFERENCE",l): 49 if actref!="": 50 actref=re.sub("\s\s+"," ",actref) 51 if actref!=" ": 52 references.append(actref) 53 actref="" 54 else: 55 actref+=string.lower(l[19:72]) 56 57 if actref!="": 58 actref=re.sub("\s\s+"," ",actref) 59 if actref!=" ": 60 references.append(actref) 61 return references
62 63 64 # bring dates to format: 1909-01-08
65 -def _format_date(pdb_date):
66 """Converts dates from DD-Mon-YY to YYYY-MM-DD format.""" 67 date="" 68 year=int(pdb_date[7:]) 69 if year<50: 70 century=2000 71 else: 72 century=1900 73 date=str(century+year)+"-" 74 all_months=['xxx','Jan','Feb','Mar','Apr','May','Jun','Jul',\ 75 'Aug','Sep','Oct','Nov','Dec'] 76 month=str(all_months.index(pdb_date[3:6])) 77 if len(month)==1: 78 month = '0'+month 79 date = date+month+'-'+pdb_date[:2] 80 return date
81 82
83 -def _chop_end_codes(line):
84 """Chops lines ending with ' 1CSA 14' and the like.""" 85 import re 86 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z","",line)
87
88 -def _chop_end_misc(line):
89 """Chops lines ending with ' 14-JUL-97 1CSA' and the like.""" 90 import re 91 return re.sub("\s\s\s\s+.*\Z","",line)
92
93 -def _nice_case(line):
94 """Makes A Lowercase String With Capitals.""" 95 import string 96 l=string.lower(line) 97 s="" 98 i=0 99 nextCap=1 100 while i<len(l): 101 c=l[i] 102 if c>='a' and c<='z' and nextCap: 103 c=string.upper(c) 104 nextCap=0 105 elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\ 106 c=='-' or c=='_': 107 nextCap=1 108 s+=c 109 i+=1 110 return s
111
def parse_pdb_header(file):
    """
    Returns the header lines of a pdb file as a dictionary.

    file is either the name of a PDB file (a string) or an open file-like
    object that yields lines.  Lines are read only up to the first ATOM,
    HETATM or MODEL record; the handle is always closed afterwards, also
    when it was supplied by the caller.

    Dictionary keys are: head, deposition_date, release_date, structure_method,
    resolution, structure_reference, journal_reference, author and
    compound.
    """
    if isinstance(file, str):
        handle = open(file, 'r')
    else:
        handle = file
    header = []
    try:
        for line in handle:
            # The record name occupies the first six columns and is padded
            # with blanks, so strip the padding before comparing.
            if line[0:6].rstrip() in ('ATOM', 'HETATM', 'MODEL'):
                break
            header.append(line)
    finally:
        handle.close()
    return _parse_pdb_header_list(header)
133
def _add_molecule_token(section, molid, last_key, text):
    """Store one COMPND/SOURCE specification in section.

    text is either "key: value" or the continuation of the previous value.
    A "mol_id" key opens a new sub-dictionary in section.  Returns the
    (molid, last_key) pair to use for the next line of the same record type.
    """
    tok = text.split(":")
    if len(tok) < 2:
        # No "key:" prefix - the line continues the previous value.
        section[molid][last_key] += tok[0] + " "
        return molid, last_key
    ckey = tok[0]
    cval = tok[1].lstrip()
    if ckey == 'mol_id':
        section[cval] = {'misc': ''}
        return cval, "misc"
    section[molid][ckey] = cval
    return molid, ckey


def _parse_pdb_header_list(header):
    """Parse a list of PDB header lines into a dictionary.

    header is a list of strings, one per header line.  The returned
    dictionary always has the keys name, head, deposition_date,
    release_date, structure_method, resolution, structure_reference,
    journal_reference, author, compound and source; the keys keywords and
    journal are only present if the header has KEYWDS / JRNL records.
    resolution is a float, or None if the REMARK 2 resolution field is not
    a number.  compound and source map a molecule id to a dictionary of
    the specifications given for that molecule.
    """
    # database fields
    pdbh = {'name': "",
            'head': '',
            'deposition_date': "1909-01-08",
            'release_date': "1909-01-08",
            'structure_method': "unknown",
            'resolution': 0.0,
            'structure_reference': "unknown",
            'journal_reference': "unknown",
            'author': "",
            'compound': {'1': {'misc': ''}},
            'source': {'1': {'misc': ''}}}

    pdbh['structure_reference'] = _get_references(header)
    pdbh['journal_reference'] = _get_journal(header)
    # COMPND and SOURCE keep separate state: each record type has its own
    # current molecule id and its own "last key" for continuation lines.
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    for hh in header:
        h = hh.rstrip()  # chop linebreaks and trailing blanks off
        key = re.sub(r"\s.+\s*", "", h)
        # Drop the record name and an optional continuation number.  The
        # number must be a token of its own, so that content starting with
        # a digit (e.g. "3D STRUCTURE ...") is left intact.
        tail = re.sub(r"\A\w+\s+(?:\d+(?:\s+|\Z))?", "", h)

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            # Join continuation lines with a blank, without a leading one.
            if pdbh['name']:
                pdbh['name'] += " " + name
            else:
                pdbh['name'] = name
        elif key == "HEADER":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh['deposition_date'] = _format_date(_nice_case(rr.group()))
            pdbh['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # look for E.C. numbers in COMPND lines
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                pdbh['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _add_molecule_token(
                pdbh['compound'], comp_molid, last_comp_key, tt)
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            src_molid, last_src_key = _add_molecule_token(
                pdbh['source'], src_molid, last_src_key, tt)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdbh:
                pdbh['keywords'] += " " + kwd
            else:
                pdbh['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            pdbh['structure_method'] = expd.lower()
        elif key == "CAVEAT":
            # TODO: make Annotation entries out of these!
            pass
        elif key == "REVDAT":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh['release_date'] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if 'journal' in pdbh:
                pdbh['journal'] += tail
            else:
                pdbh['journal'] = tail
        elif key == "AUTHOR":
            # 'author' is initialised above, so the lines are simply appended.
            pdbh['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            # The remark number sits in a fixed-width, blank-padded field.
            found = re.search(r"REMARK\s+2\s+RESOLUTION\.", h)
            if found:
                r = _chop_end_codes(h[:found.start()] + h[found.end():])
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh['resolution'] = float(r)
                except ValueError:
                    # nonstandard resolution, e.g. "NOT APPLICABLE."
                    pdbh['resolution'] = None

    if pdbh['structure_method'] == 'unknown':
        resolution = pdbh['resolution']
        # resolution may be None (see above); None must not be compared.
        if resolution is not None and resolution > 0.0:
            pdbh['structure_method'] = 'x-ray diffraction'
    return pdbh
if __name__ == '__main__':
    # Reads the PDB file passed as first command line argument, parses its
    # header and prints every entry of the resulting dictionary.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <pdb file>\n" % sys.argv[0])
        sys.exit(1)

    # parse_pdb_header accepts a file name and closes the file itself.
    header_dict = parse_pdb_header(sys.argv[1])

    # print the dictionary
    for field in header_dict:
        print("-" * 40)
        print(field)
        print(header_dict[field])