Cluster Analysis in F# QuickStart Sample

Illustrates how to use the classes in the Extreme.Statistics.Multivariate namespace to perform hierarchical clustering and K-means clustering in F#.

View this sample in: C# Visual Basic IronPython

#light

open System
open System.Linq

open Extreme.Data.Stata
open Extreme.Mathematics
open Extreme.Statistics
open Extreme.Statistics.Multivariate

// Verify the library license. The check happens at runtime; this sample
// passes a demo license string.
// For more information, see:
// https://numerics.net/trial-key
// The result is bound to `licensed` but is not read again in this script.
let licensed = Extreme.License.Verify("Demo license")

// Demonstrates how to use classes that implement
// hierarchical and K-means clustering.

// This QuickStart Sample demonstrates how to run two
// common multivariate analysis techniques:
// hierarchical cluster analysis and K-means cluster analysis.
//
// The classes used in this sample reside in the
// Extreme.Statistics.Multivariate namespace.

// Load the sample dataset, which is from
//     Computer-Aided Multivariate Analysis, 4th Edition
//     by A. A. Afifi, V. Clark and S. May, chapter 16
//     See http://www.ats.ucla.edu/stat/Stata/examples/cama4/default.htm
//
// The path is relative to the current working directory. It is assembled
// with Path.Combine so that the platform's directory separator is used
// rather than a hard-coded backslash, which only resolves on Windows.
let frame =
    let filename =
        System.IO.Path.Combine("..", "..", "..", "..", "Data", "companies.dta")
    StataFile.ReadDataFrame(filename)

// 
// Hierarchical cluster analysis
//

printfn "Hierarchical clustering"

// Build the model from the data frame and the names of the columns
// that take part in the analysis:
let columns =
    [| "ror5"
       "de"
       "salesgr5"
       "eps5"
       "npm1"
       "pe"
       "payoutr1" |]

let hc = HierarchicalClusterAnalysis(frame, columns)

// The same variables can also be given as a formula. hc2 only illustrates
// this alternative constructor call; it is not fitted or used further
// in this script:
let formula = "ror5 + de + salesgr5 + eps5 + npm1 + pe + payoutr1"
let hc2 = HierarchicalClusterAnalysis(frame, formula)

// Standardize the variables (convert them to Z-scores) before clustering:
hc.Standardize <- true
// Centroid is the default linkage method; it is set explicitly here:
hc.LinkageMethod <- LinkageMethod.Centroid
// The distance measure can be chosen as well. This is the default:
hc.DistanceMeasure <- DistanceMeasures.SquaredEuclideanDistance

// Run the analysis:
hc.Fit()

// Cut the hierarchy into a partition of 5 clusters:
let partition = hc.GetClusterPartition(5)
// Enumerate the partition to visit each cluster in turn
// (individual clusters can also be accessed through an index).
for cluster in partition do
    printfn "Cluster %d has %d members." cluster.Index cluster.Size

// Get a variable that holds the cluster membership of each observation.
// Only the observations from index 15 onward are listed:
let memberships = partition.GetMemberships()
for i = 15 to memberships.Length - 1 do
    let clusterIndex = memberships.[i].Index
    printfn "Observation %d belongs to cluster %d" i clusterIndex

// A dendrogram draws the clustering as a tree. Everything needed to
// draw one can be reached from the root node of the dendrogram:
let root = hc.DendrogramRoot
// Position and DistanceMeasure give the x and y coordinates of a node:
(root.Position, root.DistanceMeasure) ||> printfn "Root position: (%.4f, %.4f)"
// The left and right children of the root:
root.LeftChild.Position |> printfn "Position of left child: %.4f"
root.RightChild.Position |> printfn "Position of right child: %.4f"

// A sort order suitable for drawing the dendrogram is available as well.
// It is retrieved here for illustration only and is not used further:
let sortOrder = hc.GetDendrogramOrder()
printfn ""

//
// K-Means Clustering
//

printfn "K-means clustering"

// Build the model from the same data frame and columns. The last argument
// (3) is presumably the number of clusters -- confirm against the API
// reference:
let kmc = KMeansClusterAnalysis(frame, columns, 3)
// Standardize the variables (convert them to Z-scores) before clustering:
kmc.Standardize <- true

// Run the analysis:
kmc.Fit()

// Predictions is a categorical vector that holds the cluster assignment
// of each observation:
let predictions = kmc.Predictions
// GetDistancesToCenters returns a vector with the distance of each
// observation to its center:
let distances = kmc.GetDistancesToCenters()

// For example, list the observations from index 18 onward:
for i in 18 .. predictions.Length - 1 do
    printfn "Observation %d belongs to cluster %d, distance: %.4f."
        i predictions.[i] distances.[i]

// Splitting the distances by cluster assignment makes it possible to
// compute descriptive statistics for each group:
let descriptives =
    distances
        .SplitBy(predictions)
        .Map(fun group -> Descriptives<float>(group))

// The clusters themselves are exposed through the Clusters property:
let clusters = kmc.Clusters
// Enumerate them to report on each one
// (individual clusters can also be accessed through an index).
for cluster in clusters do
    printfn "Cluster %d has %d members. Sum of squares: %.4f"
        cluster.Index cluster.Size cluster.SumOfSquares
    cluster.Center.ToString("F4") |> printfn "Center: %s"

// The distances between clusters are also available:
kmc.GetClusterDistances().ToString("F4") |> printfn "%s"

// You can get the indexes of the observations in a single cluster
// through the categorical vector of predictions:
let level1Indexes =
    let assignments = kmc.Predictions :> ICategoricalVector
    assignments.GetIndexes(1).ToArray()
printfn "Number of items in cluster 1: %d" level1Indexes.Length
            
// Keep the console open until the user responds to the prompt.
// The prompt asks for "any key", so Console.ReadKey is used: it returns on
// the first key press, whereas Console.ReadLine keeps waiting for Enter.
// Passing true keeps the pressed key from being echoed.
// ReadKey throws when standard input is redirected, so in that case the
// script falls back to reading a line instead.
printf "Press any key to exit."
if Console.IsInputRedirected then
    Console.ReadLine() |> ignore
else
    Console.ReadKey(true) |> ignore