Cluster Analysis in IronPython QuickStart Sample

Illustrates how to use the classes in the Extreme.Statistics.Multivariate namespace to perform hierarchical clustering and K-means clustering in IronPython.


```Python
from System import Array

import numerics

from Extreme.Mathematics import *
from Extreme.Statistics import *
from Extreme.Statistics.Multivariate import *

# Demonstrates how to use classes that implement
# hierarchical and K-means clustering.

# This QuickStart Sample demonstrates how to run two
# common multivariate analysis techniques:
# hierarchical cluster analysis and K-means cluster analysis.
#
# The classes used in this sample reside in the
# Extreme.Statistics.Multivariate namespace.

# First, our dataset, which is from
#     Computer-Aided Multivariate Analysis, 4th Edition
#     by A. A. Afifi, V. Clark and S. May, chapter 16
#     See http://www.ats.ucla.edu/stat/Stata/examples/cama4/default.htm
ror5 = NumericalVariable("ror5", Vector([ \
    13.00, 13.00, 13.00, 12.20, 10.00, 9.80, 9.90, 10.30, \
    9.50, 9.90, 7.90, 7.30, 7.80, 6.50, 9.20, 8.90, 8.40, \
    9.00, 12.90, 15.20, 18.40, 9.90, 9.90, 10.20, 9.20]))
de = NumericalVariable("de", Vector([ \
    .70, .70, .40, .20, .40, .50, .50, .30, \
    .40, .40, .40, .60, .40, .40, 2.70, .90, \
     1.20, 1.10, .30, .70, .20, 1.60, 1.10, .50, 1.00]))
salesgr5 = NumericalVariable("salesgr5", Vector([ \
    20.20, 17.20, 14.50, 12.90, 13.60, 12.10, 10.20, 11.40, \
    13.50, 12.10, 10.80, 15.40, 11.00, 18.70, 39.80, 27.80, \
    38.70, 22.10, 16.00, 15.30, 15.00, 9.60, 17.90, 12.60, 11.60]))
eps5 = NumericalVariable("eps5", Vector([ \
    15.50, 12.70, 15.10, 11.10,  8.00, 14.50, 7.00, 8.70, \
    5.90, 4.20, 16.00, 4.90, 3.00, -3.10, 34.40, 23.50, \
    24.60, 21.90, 16.20, 11.60, 11.60, 24.30, 15.30, 18.00, 4.50]))
npm1 = NumericalVariable("npm1", Vector([ \
    7.20, 7.30, 7.90, 5.40, 6.70, 3.80, 4.80, 4.50, \
    3.50, 4.60, 3.40, 5.10, 5.60, 1.30, 5.80, 6.70, \
    4.90, 6.00, 5.70, 1.50, 1.60, 1.00, 1.60, .90, .80]))
pe = NumericalVariable("pe", Vector([ \
    9.00, 8.00, 8.00, 9.00, 5.00, 6.00, 10.00, 9.00, \
    11.00, 9.00, 7.00, 7.00, 7.00, 10.00, 21.00, 22.00, \
    19.00, 19.00, 14.00, 8.00, 9.00, 6.00, 8.00, 6.00, 7.00]))
payoutr1 = NumericalVariable("payoutr1", Vector([ \
    .4263980, .3806930, .4067800, .5681820, .3245440, .5108083, \
    .3789130, .4819280, .5732480, .4907980, .4891300, .2722770, \
    .3156460, .3840000, .3908790, .1612900, .3030300, .3033180, \
    .2875000, .5989300, .5783130, .1949460, .3210700, .4537310, \
    .5949660]))
variables = Array[NumericalVariable]([ ror5, de, salesgr5, eps5, npm1, pe, payoutr1 ])
collection = VariableCollection(variables) 

# 
# Hierarchical cluster analysis
#

print "Hierarchical clustering"

# Create the model:
hc = HierarchicalClusterAnalysis(variables)
# Rescale the variables to their Z-scores before doing the analysis:
hc.Standardize = True
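
# Standardizing rescales each value x to (x - mean) / stdev so that all
# variables contribute on a comparable scale. A plain-Python sketch of
# the same transformation (using the sample standard deviation):
def zScores(values):
    n = float(len(values))
    mean = sum(values) / n
    stdev = (sum((x - mean) ** 2 for x in values) / (n - 1)) ** 0.5
    return [(x - mean) / stdev for x in values]
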
# The linkage method defaults to Centroid; we set it explicitly here:
hc.LinkageMethod = LinkageMethod.Centroid
# The distance measure also defaults to this value; we set it explicitly:
hc.DistanceMeasure = DistanceMeasures.SquaredEuclidianDistance
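
# For reference, the squared Euclidean distance between two vectors x
# and y is sum((x[i] - y[i])**2). A plain-Python sketch:
def squaredEuclidean(x, y):
    return sum((a - b) ** 2 for a, b in zip(x, y))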

# Compute the model:
hc.Compute()
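
# For intuition: agglomerative clustering starts with every observation
# in its own cluster and repeatedly merges the two closest clusters.
# A plain-Python sketch of picking the next pair to merge, given a
# cluster-to-cluster distance function (illustrative only, not the
# library's implementation):
def closestPair(clusterList, distance):
    best = None
    for i in range(len(clusterList)):
        for j in range(i + 1, len(clusterList)):
            d = distance(clusterList[i], clusterList[j])
            if best is None or d < best[0]:
                best = (d, i, j)
    return best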

# We can partition the cases into clusters:
partition = hc.GetClusterPartition(5)
# Individual clusters are accessed through an index, or through enumeration.
for cluster in partition:
    print "Cluster {0} has {1} members.".format(cluster.Index, cluster.Size)

# And get a filter for the observations in a single cluster:
collection.Filter = partition[3].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count
collection.Filter = None

# Get a variable that shows memberships:
memberships = partition.GetMemberships()
# (We print only the last few observations.)
for i in range(15, memberships.Length):
    print "Observation {0} belongs to cluster {1}".format(i, memberships.GetLevelIndex(i))

# A dendrogram is a graphical representation of the clustering in the form of a tree.
# You can get all the information you need to draw a dendrogram starting from 
# the root node of the dendrogram:
root = hc.DendrogramRoot
# Position and DistanceMeasure give the x and y coordinates:
print "Root position: ({0:.4f}, {1:.4f})".format(root.Position, root.DistanceMeasure)
# The left and right children:
print "Position of left child: {0:.4f}".format(root.LeftChild.Position)
print "Position of right child: {0:.4f}".format(root.RightChild.Position)

# You can also get a filter that defines a sort order suitable for
# drawing the dendrogram:
sortOrder = hc.GetDendrogramOrder()
print 

#
# K-Means Clustering
#

print "K-means clustering"

# Create the model, requesting 3 clusters:
kmc = KMeansClusterAnalysis(variables, 3)
# Rescale the variables to their Z-scores before doing the analysis:
kmc.Standardize = True

# Compute the model:
kmc.Compute()
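
# For intuition: K-means alternates between assigning each point to its
# nearest center and moving each center to the mean of its members.
# A plain-Python sketch of one such iteration, reusing the
# squaredEuclidean helper defined above (illustrative only, not the
# library's implementation):
def kMeansStep(points, centers):
    # Assign each point to the index of its nearest center:
    assignments = [min(range(len(centers)),
                       key=lambda k: squaredEuclidean(p, centers[k]))
                   for p in points]
    # Recompute each center as the mean of its assigned points:
    newCenters = []
    for k in range(len(centers)):
        members = [p for p, a in zip(points, assignments) if a == k]
        if members:
            newCenters.append([sum(c) / float(len(members)) for c in zip(*members)])
        else:
            newCenters.append(centers[k])  # keep an empty cluster's center
    return assignments, newCenters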

# We can partition the cases into clusters:
clusters = kmc.GetClusters()
# Individual clusters are accessed through an index, or through enumeration.
for cluster in clusters:
    print "Cluster {0} has {1} members. Sum of squares: {2:.4f}".format(cluster.Index, cluster.Size, cluster.SumOfSquares)
    print "Center: {0:.4f}".format(cluster.Center)

# The distances between clusters are also available:
print "{0:.4f}".format(kmc.GetClusterDistances())

# You can get a filter for the observations in a single cluster:
collection.Filter = clusters[1].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count

# Get a variable that shows memberships:
memberships = clusters.GetMemberships()
# And one that shows the distances to the centers:
distances = clusters.GetDistancesToCenters()
# (Again, we print only the last few observations.)
for i in range(18, memberships.Length):
    print "Observation {0} belongs to cluster {1}, distance: {2:.4f}.".format(i, memberships.GetLevelIndex(i), distances[i])
```