Cluster Analysis in Visual Basic QuickStart Sample

Illustrates how to use the classes in the Extreme.Statistics.Multivariate namespace to perform hierarchical clustering and K-means clustering in Visual Basic.

This sample is also available in C#, F#, and IronPython.

Option Infer On

Imports Extreme.Data.Stata
Imports Extreme.Mathematics
Imports Extreme.Statistics
Imports Extreme.Statistics.Multivariate

''' <summary>
''' Demonstrates how to use classes that implement
''' hierarchical and K-means clustering.
''' </summary>
Module ClusterAnalysisExample

    ''' <summary>
    ''' Entry point of the sample. Reads the companies data set from a
    ''' Stata file, runs a hierarchical cluster analysis and a K-means
    ''' cluster analysis on seven of its columns, and writes a summary
    ''' of each analysis to the console.
    ''' </summary>
    Sub Main()
        ' The license is verified at runtime. We're using
        ' a demo license here. For more information, see
        ' the licensing section of the library documentation.
        Extreme.License.Verify("Demo license")

        ' This QuickStart Sample demonstrates how to run two
        ' common multivariate analysis techniques:
        ' hierarchical cluster analysis and K-means cluster analysis.
        ' The classes used in this sample reside in the
        ' Extreme.Statistics.Multivariate namespace.

        ' First, our dataset, which is from
        '     Computer-Aided Multivariate Analysis, 4th Edition
        '     by A. A. Afifi, V. Clark and S. May, chapter 16
        ' The path is relative to the working directory of the process.
        Dim frame = StataFile.ReadDataFrame("..\..\..\..\Data\companies.dta")

        '
        ' Hierarchical cluster analysis
        '

        Console.WriteLine("Hierarchical clustering")

        ' Create the model:
        Dim columns = {"ror5", "de", "salesgr5", "eps5", "npm1", "pe", "payoutr1"}
        Dim hc = New HierarchicalClusterAnalysis(frame, columns)
        ' Alternatively, we could use a formula to specify the variables.
        ' This replaces the model created above; only one of the two is needed.
        Dim formula = "ror5 + de + salesgr5 + eps5 + npm1 + pe + payoutr1"
        hc = New HierarchicalClusterAnalysis(frame, formula)

        ' Rescale the variables to their Z-scores before doing the analysis:
        hc.Standardize = True
        ' The linkage method defaults to centroid:
        hc.LinkageMethod = LinkageMethod.Centroid
        ' We could set the distance measure. We use the default:
        hc.DistanceMeasure = DistanceMeasures.SquaredEuclideanDistance

        ' Fit the model. This must happen before any results are queried:
        hc.Fit()

        ' We can partition the cases into clusters:
        Dim partition As HierarchicalClusterCollection = hc.GetClusterPartition(5)
        ' Individual clusters are accessed through an index, or through enumeration.
        For Each cluster As HierarchicalCluster In partition
            Console.WriteLine("Cluster {0} has {1} members.", cluster.Index, cluster.Size)
        Next

        ' And get the indexes of the observations in a single cluster:
        Dim indexes = partition(3).MemberIndexes
        Console.WriteLine("Number of items in the partition: {0}", indexes.Length)

        ' Get a variable that shows memberships.
        ' Only the observations from index 15 onwards are listed to keep the output short.
        Dim memberships = partition.GetMemberships()
        For i As Integer = 15 To memberships.Length - 1
            Console.WriteLine("Observation {0} belongs to cluster {1}", i, memberships.GetLevelIndex(i))
        Next i

        ' A dendrogram is a graphical representation of the clustering in the form of a tree.
        ' You can get all the information you need to draw a dendrogram starting from
        ' the root node of the dendrogram:
        Dim root As DendrogramNode = hc.DendrogramRoot
        ' Position and DistanceMeasure give the x and y coordinates:
        Console.WriteLine("Root position: ({0:F4}, {1:F4})", root.Position, root.DistanceMeasure)
        ' The left and right children:
        Console.WriteLine("Position of left child: {0:F4}", root.LeftChild.Position)
        Console.WriteLine("Position of right child: {0:F4}", root.RightChild.Position)

        ' You can also get a filter that defines a sort order suitable for
        ' drawing the dendrogram (not used further in this sample):
        Dim sortOrder = hc.GetDendrogramOrder()

        '
        ' K-Means Clustering
        '

        Console.WriteLine("K-means clustering")

        ' Create the model. We need to specify the number of clusters up front:
        Dim kmc As New KMeansClusterAnalysis(frame, columns, 3)
        ' Rescale the variables to their Z-scores before doing the analysis:
        kmc.Standardize = True

        ' Fit the model. This must happen before any results are queried:
        kmc.Fit()

        ' The Predictions property is a categorical vector that contains
        ' the cluster assignments
        Dim predictions = kmc.Predictions
        ' The GetDistancesToCenters method returns a vector containing
        ' the distance of each observation to its center.
        Dim distances = kmc.GetDistancesToCenters()

        ' For example (again listing only the tail of the data set):
        For i = 18 To predictions.Length - 1
            Console.WriteLine("Observation {0} belongs to cluster {1}, distance: {2:F4}.",
                    i, predictions(i), distances(i))
        Next

        ' You can use this to compute several statistics per cluster.
        ' The local is deliberately not named after the Descriptives type.
        Dim clusterDescriptives = distances.SplitBy(predictions).
                Map(Function(x) New Descriptives(Of Double)(x))

        ' Individual clusters are accessed through an index, or through enumeration.
        For i = 0 To clusterDescriptives.Length - 1
            Console.WriteLine("Cluster {0} has {1} members. Sum of squares: {2:F4}",
                    i, clusterDescriptives(i).Count, clusterDescriptives(i).SumOfSquares)
            ' Print the center vector of the cluster rather than the cluster object itself.
            Console.WriteLine("Center: {0:F4}", kmc.Clusters(i).Center)
        Next

        ' The distances between clusters are also available.
        ' NOTE(review): the statement that printed them is missing from this copy of
        ' the sample; presumably a call on kmc such as GetClusterDistances() - confirm
        ' against the KMeansClusterAnalysis API reference before restoring it.

        ' You can get a filter for the observations in a single cluster.
        ' This uses the GetIndexes method of categorical vectors.
        Dim level1Indexes = kmc.Predictions.GetIndexes(1).ToArray()
        Console.WriteLine("Number of items in cluster 1: {0}", level1Indexes.Length)

        Console.Write("Press any key to exit.")
        ' Actually wait for the key press announced above. ReadKey throws when
        ' standard input is redirected, so skip the wait in that case.
        If Not Console.IsInputRedirected Then
            Console.ReadKey(True)
        End If
    End Sub

End Module