Hierarchical Clustering


/ Published in: Python
Save to your folder(s)

Helps if you have a distance function. This will be implementation specific, but this is the main algo


Copy this code and paste it in your HTML
  1. def hcluster(rows,distance=pearson):
  2. distances={}
  3. currentclustid=-1
  4.  
  5. # Clusters are initially just the rows
  6. clust=[bicluster(rows[i],id=i) for i in range(len(rows))]
  7.  
  8. while len(clust)>1:
  9. lowestpair=(0,1)
  10. closest=distance(clust[0].vec,clust[1].vec)
  11.  
  12. # loop through every pair looking for the smallest distance
  13. for i in range(len(clust)):
  14. for j in range(i+1,len(clust)):
  15. # distances is the cache of distance calculations
  16. if (clust[i].id,clust[j].id) not in distances:
  17. distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
  18.  
  19. d=distances[(clust[i].id,clust[j].id)]
  20.  
  21. if d<closest:
  22. closest=d
  23. lowestpair=(i,j)
  24.  
  25. # calculate the average of the two clusters
  26. mergevec=[
  27. (clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0
  28. for i in range(len(clust[0].vec))]
  29.  
  30. # create the new cluster
  31. newcluster=bicluster(mergevec,left=clust[lowestpair[0]],
  32. right=clust[lowestpair[1]],
  33. distance=closest,id=currentclustid)
  34.  
  35. # cluster ids that weren't in the original set are negative
  36. currentclustid-=1
  37. del clust[lowestpair[1]]
  38. del clust[lowestpair[0]]
  39. clust.append(newcluster)
  40.  
  41. return clust[0]

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.