Principal Component Analysis (PCA) in IronPython QuickStart Sample

Illustrates how to perform a Principal Components Analysis using classes in the Extreme.Statistics.Multivariate namespace in IronPython.

View this sample in: C# Visual Basic F#

import numerics

from Extreme.Mathematics import *
from Extreme.Mathematics.LinearAlgebra.IO import *
from Extreme.Statistics import *
from Extreme.Statistics.Multivariate import *

# Demonstrates how to use classes that implement
# Principal Component Analysis (PCA).

# This QuickStart Sample demonstrates how to perform
# a principal component analysis on a set of data.
# The classes used in this sample reside in the
# Extreme.Statistics.Multivariate namespace..

# First, our dataset, 'depress.txt', which is from
#     Computer-Aided Multivariate Analysis, 4th Edition
#     by A. A. Afifi, V. Clark and S. May, chapter 16
#     See

# The data is in delimited text format. Use a matrix reader to load it into a matrix.
reader = DelimitedTextMatrixReader(r"..\Data\Depress.txt")
reader.MergeConsecutiveDelimiters = True
reader.SetColumnDelimiters(' ')
m = reader.ReadMatrix()

# The data we want is in columns 8 through 27:
m = m.GetSubmatrix(0, m.RowCount - 1, 8, 27)

# Principal component analysis

# We can construct PCA objects in many ways. Since we have the data in a matrix, # we use the constructor that takes a matrix as input.
pca = PrincipalComponentAnalysis(m)
# and immediately perform the analysis:
# We can get the contributions of each component:
print " #    Eigenvalue Difference Contribution Contrib. %"
for i in range(5):
    # We get the ith component from the model...
    component = pca.Components[i]
    # and write out its properties
    print "{0:2}{1:12.4f}{2:11.4f}{2:14.3f}%{3:10.3f}%" \
        .format( i, component.Eigenvalue, component.EigenvalueDifference, \
        100 * component.ProportionOfVariance, 100 * component.CumulativeProportionOfVariance)

# To get the proportions for all components, use the
# properties of the PCA object:
proportions = pca.VarianceProportions

# To get the number of components that explain a given proportion
# of the variation, use the GetVarianceThreshold method:
count = pca.GetVarianceThreshold(0.9)
print "Components needed to explain 90% of variation:", count

# The value property gives the components themselves:
print "Components:"
print "Var.      1       2       3       4       5"
pcs = pca.Components
for i in range(pcs.Count):
    print "{0:4}{1:8.4f}{2:8.4f}{3:8.4f}{4:8.4f}{5:8.4f}" \
        .format(i, pcs[0].Value[i], pcs[1].Value[i], pcs[2].Value[i], pcs[3].Value[i], pcs[4].Value[i])

# The scores are the coefficients of the observations expressed as a combination
# of principal components.
scores = pca.ScoreMatrix

# To get the predicted observations based on a specified number of components, # use the GetPredictions method.
prediction = pca.GetPredictions(count)
print "Predictions using", count, "components:"
print "   Pr. 1  Act. 1   Pr. 2  Act. 2   Pr. 3  Act. 3   Pr. 4  Act. 4", count
for i in range(0, 10):
    print "{0:8.4f}{1:8.4f}{2:8.4f}{3:8.4f}{4:8.4f}{5:8.4f}{6:8.4f}{7:8.4f}" \
        .format(prediction[0].GetValue(i), m[i, 0], prediction[1].GetValue(i), m[i, 1], \
        prediction[2].GetValue(i), m[i, 2], prediction[3].GetValue(i), m[i, 3])