Cluster Analysis in C# QuickStart Sample

Illustrates how to use the classes in the Extreme.Statistics.Multivariate namespace to perform hierarchical clustering and K-means clustering in C#.

This sample is also available in: Visual Basic, F#, and IronPython.

using System;
using System.Linq;

using Extreme.Data.Stata;
using Extreme.Mathematics;
using Extreme.Statistics;
using Extreme.Statistics.Multivariate;

namespace Extreme.Numerics.Quickstart.CSharp
{
    /// <summary>
    /// Demonstrates how to use classes that implement
    /// hierarchical and K-means clustering.
    /// </summary>
    class ClusterAnalysis
    {
        static void Main(string[] args)
        {
            // The license is verified at runtime. We're using
            // a demo license here. For more information, see
            // the license key documentation.
            Extreme.License.Verify("Demo license");

            // This QuickStart Sample demonstrates how to run two
            // common multivariate analysis techniques:
            // hierarchical cluster analysis and K-means cluster analysis.
            // The classes used in this sample reside in the
            // Extreme.Statistics.Multivariate namespace.

            // First, our dataset, which is from
            //     Computer-Aided Multivariate Analysis, 4th Edition
            //     by A. A. Afifi, V. Clark and S. May, chapter 16.
            var frame = StataFile.ReadDataFrame(@"..\..\..\..\Data\companies.dta");

            // Hierarchical cluster analysis

            Console.WriteLine("Hierarchical clustering");

            // Create the model:
            var columns = new string[] { "ror5", "de", "salesgr5", "eps5", "npm1", "pe", "payoutr1" };
            var hc = new HierarchicalClusterAnalysis(frame, columns);
            // Alternatively, we could use a formula to specify the variables:
            string formula = "ror5 + de + salesgr5 + eps5 + npm1 + pe + payoutr1";
            hc = new HierarchicalClusterAnalysis(frame, formula);

            // Rescale the variables to their Z-scores before doing the analysis:
            hc.Standardize = true;
            // The linkage method defaults to Centroid:
            hc.LinkageMethod = LinkageMethod.Centroid;
            // We could set the distance measure. We use the default:
            hc.DistanceMeasure = DistanceMeasures.SquaredEuclideanDistance;

            // Compute the model:
            hc.Compute();

            // We can partition the cases into clusters:
            HierarchicalClusterCollection partition = hc.GetClusterPartition(5);
            // Individual clusters are accessed through an index, or through enumeration.
            foreach (HierarchicalCluster cluster in partition)
            {
                Console.WriteLine("Cluster {0} has {1} members.", cluster.Index, cluster.Size);
            }

            var count3 = partition[3].Size;
            Console.WriteLine("Number of items in cluster 3: {0}", count3);

            // Get a variable that shows memberships:
            var memberships = partition.GetMemberships();
            // Print a tail of the memberships to keep the output short:
            for (int i = 15; i < memberships.Length; i++)
            {
                Console.WriteLine("Observation {0} belongs to cluster {1}", i, memberships[i].Index);
            }

            // A dendrogram is a graphical representation of the clustering in the form of a tree.
            // You can get all the information you need to draw a dendrogram starting from
            // the root node of the dendrogram:
            DendrogramNode root = hc.DendrogramRoot;
            // Position and DistanceMeasure give the x and y coordinates:
            Console.WriteLine("Root position: ({0:F4}, {1:F4})", root.Position, root.DistanceMeasure);
            // The left and right children:
            Console.WriteLine("Position of left child: {0:F4}", root.LeftChild.Position);
            Console.WriteLine("Position of right child: {0:F4}", root.RightChild.Position);

            // You can also get a permutation that orders the observations
            // in a way suitable for drawing the dendrogram:
            var sortOrder = hc.GetDendrogramOrder();

            // K-Means Clustering

            Console.WriteLine("K-means clustering");

            // Create the model:
            KMeansClusterAnalysis kmc = new KMeansClusterAnalysis(frame, columns, 3);
            // Rescale the variables to their Z-scores before doing the analysis:
            kmc.Standardize = true;

            // Compute the model:
            kmc.Compute();

            // The Predictions property is a categorical vector that contains
            // the cluster assignments:
            var predictions = kmc.Predictions;
            // The GetDistancesToCenters method returns a vector containing
            // the distance of each observation to its center.
            var distances = kmc.GetDistancesToCenters();

            // For example (a tail of the observations, to keep the output short):
            for (int i = 18; i < predictions.Length; i++)
            {
                Console.WriteLine("Observation {0} belongs to cluster {1}, distance: {2:F4}.",
                    i, predictions[i], distances[i]);
            }

            // You can use this to compute several statistics:
            var descriptives = distances.SplitBy(predictions)
                .Map(x => new Descriptives<double>(x));

            // Individual clusters are accessed through an index, or through enumeration.
            for (int i = 0; i < descriptives.Length; i++)
            {
                Console.WriteLine("Cluster {0} has {1} members. Sum of squares: {2:F4}",
                    i, descriptives[i].Count, descriptives[i].SumOfSquares);
                Console.WriteLine("Center: {0:F4}", kmc.Clusters[i].Center);
            }

            // NOTE(review): the distances between clusters are also available
            // through the model; the original sample's call was lost here — confirm
            // the API (e.g. a GetClusterDistances-style method) before restoring it.

            // You can get a filter for the observations in a single cluster.
            // This uses the GetIndexes method of categorical vectors.
            var level1Indexes = kmc.Predictions.GetIndexes(1).ToArray();
            Console.WriteLine("Number of items in cluster 1: {0}", level1Indexes.Length);

            Console.Write("Press any key to exit.");
            Console.ReadLine();
        }
    }
}