Numpy and SciPy are fast when the function is wrapped over the C implementation. But it can get very slow for a python implementation with nested loops.
The default implementation scipy.spatial.distance.pdist use nested for loops to calculate pairwise distance, which is not optimal.
Here is my version of the code to calculate the pairwise JS divergence using Numpy and Scipy. The input is a matrix whose rows are the observations.
In comparison to the pdist version, my code is 2-3 times faster. Here is the output when the main method of this script runs:
pdist took 2279.342 ms js_div_matrix took 600.961 ms
The source code is here, tested on python 2.6. Download the code here.
Smooth the input before calling the method! (i.e. No value zero is allowed in the input)
from scipy import *
from numpy import *
from normalisation import *
from scipy.spatial.distance import pdist,squareform
import time
import pdb
from scipy.stats.distributions import entropy
#efficnent js_div
def js_div_matrix(a):
a=array(a)
W=zeros((a.shape[0],a.shape[0]))
e=-entropy(a.transpose())
for i in range(a.shape[0]):
val_range=range(i+1,a.shape[0])
sumAB=tile(a[i,:],(a.shape[0]-i-1,1))+a[val_range,:]
result=0.5*(e[i]+e[val_range,:]-sum((sumAB)*log(((sumAB)/2)),1))
W[val_range,i]=result
W[i,val_range]=result
return W
def js_div(A,B):
half=(A+B)/2
return 0.5*kl_div(A,half)+0.5*kl_div(B,half)
def kl_div(A,B):
return sum(multiply(A,log(A/B)))
if __name__=="__main__":
a=l1(random.random((200,300)))
t1 = time.time()
W1 = squareform(pdist(a,js_div))
t2 = time.time()
print '%s took %0.3f ms' % (pdist.func_name, (t2-t1)*1000.0)
t1 = time.time()
W2=js_div_matrix(a)
t2 = time.time()
print '%s took %0.3f ms' % (js_div_matrix.func_name, (t2-t1)*1000.0)
Comments