
Source Code for Module Bio.NaiveBayes

# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""This provides code for a general Naive Bayes learner.

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data.  The observations are discrete feature vectors, with
the Bayes assumption that the features are independent.  Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
observation    A feature vector of discrete data.
class          A possible classification for an observation.


Classes:
NaiveBayes     Holds information for a naive Bayes classifier.

Functions:
train          Train a new naive Bayes classifier.
calculate      Calculate the probabilities of each class, given an observation.
classify       Classify an observation into a class.

"""
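
# Under the independence ("naive") assumption, an observation
# x = (x_1, ..., x_n) in class c satisfies
#     P(x|c) = P(x_1|c) * P(x_2|c) * ... * P(x_n|c)
# and Bayes rule then gives the posterior
#     P(c|x) = P(x|c) * P(c) / P(x).
# The functions below work with the logs of these quantities, since a
# product of many small probabilities would otherwise underflow.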
# To Do:
# add code to help discretize data
# use objects

try:
    from Numeric import *
except ImportError:
    raise ImportError, "This module requires Numeric (precursor to NumPy)"

import math

from Bio import mathfns, listfns
class NaiveBayes:
    """Holds information for a NaiveBayes classifier.

    Members:
    classes         List of the possible classes of data.
    p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
    p_prior         List of the prior probabilities for every class.
    dimensionality  Dimensionality of the data.

    """
    def __init__(self):
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None

def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class.  nb is a NaiveBayes
    classifier that has been trained.  observation is a list representing
    the observed data.  scale is whether the probability should be
    scaled by P(observation).  By default, no scaling is done.  The return
    value is a dictionary where the keys are the classes and the values
    are the log probabilities of each class.

    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError, "observation in %d dimensions, but classifier in %d" \
              % (len(observation), nb.dimensionality)

    # Calculate log P(observation|class) for every class.
    lp_observation_class = []   # list of log P(observation|class)
    for i in range(len(nb.classes)):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = [mathfns.safe_log(x, -10000) for x in probs]
        lprob = sum(lprobs)
        lp_observation_class.append(lprob)

    # Calculate log P(class).
    lp_prior = map(math.log, nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0        # P(observation)
    if scale:                   # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = zeros(len(nb.classes), Float32)
        for i in range(len(nb.classes)):
            obs[i] = mathfns.safe_exp(lp_prior[i] + lp_observation_class[i],
                                      under=1E-300)
        lp_observation = math.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}   # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = \
            lp_observation_class[i] + lp_prior[i] - lp_observation

    return lp_class_observation
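
# For example, given a classifier nb trained on two classes, a call such as
#     probs = calculate(nb, [1, 0], scale=1)
# returns a dict like {"yes": -0.105, "no": -2.303} (class names and values
# illustrative).  Applying math.exp to each value recovers
# P(class|observation), and with scale=1 these probabilities sum to 1.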

def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class.

    """
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=0)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class

def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors][, typecode]) -> NaiveBayes

    Train a naive Bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.  typecode is the Numeric
    typecode used to store the observations as arrays.

    """
    if not len(training_set):
        raise ValueError, "No data in the training set."
    if len(training_set) != len(results):
        raise ValueError, "training_set and results should be parallel lists."

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError, "observations have different dimensionality"

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes.
    nb.classes = listfns.items(results)
    nb.classes.sort()   # keep it tidy

    # Estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
    else:
        percs = listfns.contents(results)
    nb.p_prior = zeros(len(nb.classes), Float16)
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations by class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = listfns.itemindex(nb.classes)       # class to index of class
    observations = [[] for c in nb.classes]   # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]   # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            #values = list(values) + range(len(nb.classes))   # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = listfns.contents(values)
    return nb
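
# A minimal usage sketch (illustrative only; the feature vectors and class
# labels below are made up).  train() estimates the prior and conditional
# probabilities from the data, and classify() returns the class with the
# highest posterior log probability.
if __name__ == "__main__":
    training_set = [[1, 0], [1, 1], [0, 1], [0, 0]]
    results = ["yes", "yes", "no", "no"]
    nb = train(training_set, results, typecode=Float32)
    print classify(nb, [1, 0])             # expected: "yes"
    print calculate(nb, [1, 0], scale=1)   # class -> log P(class|observation)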