Cluster Analysis in IronPython QuickStart Sample
Illustrates how to use the classes in the Extreme.Statistics.Multivariate namespace to perform hierarchical clustering and K-means clustering in IronPython.
```Python
from System import Array

import numerics

from Extreme.Mathematics import *
from Extreme.Statistics import *
from Extreme.Statistics.Multivariate import *

# Demonstrates how to use classes that implement
# hierarchical and K-means clustering.

# This QuickStart Sample demonstrates how to run two
# common multivariate analysis techniques:
# hierarchical cluster analysis and K-means cluster analysis.
#
# The classes used in this sample reside in the
# Extreme.Statistics.Multivariate namespace.

# First, our dataset, which is from
# Computer-Aided Multivariate Analysis, 4th Edition
# by A. A. Afifi, V. Clark and S. May, chapter 16.
# See http://www.ats.ucla.edu/stat/Stata/examples/cama4/default.htm
ror5 = NumericalVariable("ror5", Vector([ \
    13.00, 13.00, 13.00, 12.20, 10.00, 9.80, 9.90, 10.30, \
    9.50, 9.90, 7.90, 7.30, 7.80, 6.50, 9.20, 8.90, 8.40, \
    9.00, 12.90, 15.20, 18.40, 9.90, 9.90, 10.20, 9.20]))
de = NumericalVariable("de", Vector([ \
    .70, .70, .40, .20, .40, .50, .50, .30, \
    .40, .40, .40, .60, .40, .40, 2.70, .90, \
    1.20, 1.10, .30, .70, .20, 1.60, 1.10, .50, 1.00]))
salesgr5 = NumericalVariable("salesgr5", Vector([ \
    20.20, 17.20, 14.50, 12.90, 13.60, 12.10, 10.20, 11.40, \
    13.50, 12.10, 10.80, 15.40, 11.00, 18.70, 39.80, 27.80, \
    38.70, 22.10, 16.00, 15.30, 15.00, 9.60, 17.90, 12.60, 11.60]))
eps5 = NumericalVariable("eps5", Vector([ \
    15.50, 12.70, 15.10, 11.10, 8.00, 14.50, 7.00, 8.70, \
    5.90, 4.20, 16.00, 4.90, 3.00, -3.10, 34.40, 23.50, \
    24.60, 21.90, 16.20, 11.60, 11.60, 24.30, 15.30, 18.00, 4.50]))
npm1 = NumericalVariable("npm1", Vector([ \
    7.20, 7.30, 7.90, 5.40, 6.70, 3.80, 4.80, 4.50, \
    3.50, 4.60, 3.40, 5.10, 5.60, 1.30, 5.80, 6.70, \
    4.90, 6.00, 5.70, 1.50, 1.60, 1.00, 1.60, .90, .80]))
pe = NumericalVariable("pe", Vector([ \
    9.00, 8.00, 8.00, 9.00, 5.00, 6.00, 10.00, 9.00, \
    11.00, 9.00, 7.00, 7.00, 7.00, 10.00, 21.00, 22.00, \
    19.00, 19.00, 14.00, 8.00, 9.00, 6.00, 8.00, 6.00, 7.00]))
payoutr1 = NumericalVariable("payoutr1", Vector([ \
    .4263980, .3806930, .4067800, .5681820, .3245440, .5108083, \
    .3789130, .4819280, .5732480, .4907980, .4891300, .2722770, \
    .3156460, .3840000, .3908790, .1612900, .3030300, .3033180, \
    .2875000, .5989300, .5783130, .1949460, .3210700, .4537310, \
    .5949660]))
variables = Array[NumericalVariable]([ ror5, de, salesgr5, eps5, npm1, pe, payoutr1 ])
collection = VariableCollection(variables)

#
# Hierarchical cluster analysis
#

print "Hierarchical clustering"

# Create the model:
hc = HierarchicalClusterAnalysis(variables)
# Rescale the variables to their Z-scores before doing the analysis:
hc.Standardize = True
# The linkage method defaults to Centroid:
hc.LinkageMethod = LinkageMethod.Centroid
# We can set the distance measure; here we explicitly set the default:
hc.DistanceMeasure = DistanceMeasures.SquaredEuclidianDistance

# Compute the model:
hc.Compute()

# We can partition the cases into clusters:
partition = hc.GetClusterPartition(5)

# Individual clusters are accessed through an index, or through enumeration.
for cluster in partition:
    print "Cluster {0} has {1} members.".format(cluster.Index, cluster.Size)

# And get a filter for the observations in a single cluster:
collection.Filter = partition[3].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count
collection.Filter = None

# Get a variable that shows memberships:
memberships = partition.GetMemberships()
for i in range(15, memberships.Length):
    print "Observation {0} belongs to cluster {1}".format(i, memberships.GetLevelIndex(i))
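
# As a quick cross-check (a sketch, not part of the original sample), we can
# tally the membership indices in plain Python, using only the Length and
# GetLevelIndex calls shown above; the totals should match the cluster
# sizes printed earlier:
counts = {}
for i in range(memberships.Length):
    index = memberships.GetLevelIndex(i)
    counts[index] = counts.get(index, 0) + 1
for index in sorted(counts):
    print "Cluster {0}: {1} members (tallied)".format(index, counts[index])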

# A dendrogram is a graphical representation of the clustering in the form of a tree.
# You can get all the information you need to draw a dendrogram starting from
# the root node of the dendrogram:
root = hc.DendrogramRoot
# Position and DistanceMeasure give the x and y coordinates:
print "Root position: ({0:.4f}, {1:.4f})".format(root.Position, root.DistanceMeasure)
# The left and right children:
print "Position of left child: {0:.4f}".format(root.LeftChild.Position)
print "Position of right child: {0:.4f}".format(root.RightChild.Position)

# You can also get a filter that defines a sort order suitable for
# drawing the dendrogram:
sortOrder = hc.GetDendrogramOrder()
print

#
# K-means clustering
#

print "K-means clustering"

# Create the model:
kmc = KMeansClusterAnalysis(variables, 3)
# Rescale the variables to their Z-scores before doing the analysis:
kmc.Standardize = True

# Compute the model:
kmc.Compute()

# We can partition the cases into clusters:
clusters = kmc.GetClusters()

# Individual clusters are accessed through an index, or through enumeration.
for cluster in clusters:
    print "Cluster {0} has {1} members. Sum of squares: {2:.4f}".format(cluster.Index, cluster.Size, cluster.SumOfSquares)
    print "Center: {0:.4f}".format(cluster.Center)

# The distances between clusters are also available:
print "{0:.4f}".format(kmc.GetClusterDistances())

# You can get a filter for the observations in a single cluster:
collection.Filter = clusters[1].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count

# Get a variable that shows memberships:
memberships = clusters.GetMemberships()
# And one that shows the distances to the centers:
distances = clusters.GetDistancesToCenters()
for i in range(18, memberships.Length):
    print "Observation {0} belongs to cluster {1}, distance: {2:.4f}.".format(i, memberships.GetLevelIndex(i), distances[i])
```
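
The sample only inspects the root node of the dendrogram. To render a complete dendrogram you would typically walk the entire tree. The sketch below is a minimal recursive traversal that uses only the `Position`, `DistanceMeasure`, `LeftChild`, and `RightChild` properties demonstrated above; it assumes that a leaf node reports `None` for its children, which you should verify against the library's documentation for dendrogram nodes.

```Python
# A minimal sketch (not part of the original sample): collect the (x, y)
# coordinates of every node in the dendrogram by walking the tree from
# the root. Assumption: leaf nodes have no children (LeftChild is None).
def collect_nodes(node, points):
    points.append((node.Position, node.DistanceMeasure))
    if node.LeftChild is not None:
        collect_nodes(node.LeftChild, points)
    if node.RightChild is not None:
        collect_nodes(node.RightChild, points)
    return points

points = collect_nodes(hc.DendrogramRoot, [])
print "The dendrogram has {0} nodes.".format(len(points))
```

The collected coordinates, combined with the sort order returned by `GetDendrogramOrder`, contain the information needed to draw the tree with a plotting library of your choice.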