Generalized Linear Models in C# QuickStart Sample

Illustrates how to use the GeneralizedLinearModel class to compute probit, Poisson and similar regression models in C#.

View this sample in: Visual Basic, F#

using System;

using Extreme.Data.Text;
using Extreme.DataAnalysis;
using Extreme.Mathematics;
using Extreme.Statistics;

namespace Extreme.Numerics.QuickStart.CSharp
{

    /// <summary>
    /// Illustrates building generalized linear models using 
    /// the GeneralizedLinearModel class in the 
    /// Extreme.Statistics namespace of Extreme Numerics.NET.
    /// </summary>
    class GeneralizedLinearModels
    {
        /// <summary>
        /// Fits a Poisson regression and a probit regression and prints
        /// the estimated parameters and fit statistics of each model
        /// to the console.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The license is verified at runtime. We're using
            // a demo license here. For more information, see
            // https://numerics.net/trial-key
            Extreme.License.Verify("Demo license");

            // Generalized linear models can be computed using 
            // the GeneralizedLinearModel class.

            //
            // Poisson regression
            //

            // This QuickStart sample uses data about the attendance of 316 students
            // from two urban high schools. The fields are as follows:
            //   daysabs: The number of days the student was absent.
            //   male:    A binary indicator of gender.
            //   math:    The student's standardized math score.
            //   langarts:The student's standardized language arts score.
            //
            // We want to investigate the relationship between these variables.
            // 
            // See http://www.ats.ucla.edu/stat/stata/dae/poissonreg.htm

            // First, read the data from a file into a data frame.
            // The ReadAttendanceData method is defined later in this file.
            var data = ReadAttendanceData();

            // Now create the regression model. Parameters are the data frame
            // containing all variables, the name of the dependent variable,
            // and a string array containing the names of the independent variables.
            var model = new GeneralizedLinearModel(data,
                "daysabs", new string[] { "math", "langarts", "male" });

            // Alternatively, the same model can be specified with a formula.
            // This replaces the model created above; both forms are shown
            // only to illustrate the two constructors.
            model = new GeneralizedLinearModel(data,
                "daysabs ~ math + langarts + male");

            // The ModelFamily specifies the distribution of the dependent variable.
            // Since we're dealing with count data, we use a Poisson model:
            model.ModelFamily = ModelFamily.Poisson;

            // The LinkFunction specifies the relationship between the dependent variable
            // and the linear predictor of independent variables. In this case,
            // we use the canonical link function, which is the default.
            model.LinkFunction = ModelFamily.Poisson.CanonicalLinkFunction;

            // The Fit method performs the actual regression analysis.
            model.Fit();

            // The Parameters collection contains information about the regression 
            // parameters.
            PrintParameters(model);

            // In addition to these properties, Parameter objects have a GetConfidenceInterval
            // method that returns a confidence interval at a specified confidence level.
            // Notice that individual parameters can be accessed using their numeric index.
            // Parameter 0 is the intercept, if it was included, so the first
            // independent variable (math) is at index 1 in this model.
            Interval confidenceInterval = model.Parameters[1].GetConfidenceInterval(0.95);
            Console.WriteLine("95% confidence interval for math score: {0:F4} - {1:F4}",
                confidenceInterval.LowerBound, confidenceInterval.UpperBound);

            // Parameters can also be accessed by name:
            confidenceInterval = model.Parameters.Get("math").GetConfidenceInterval(0.95);
            Console.WriteLine("95% confidence interval for math score: {0:F4} - {1:F4}",
                confidenceInterval.LowerBound, confidenceInterval.UpperBound);
            Console.WriteLine();

            // There is also a wealth of information about the analysis available
            // through various properties of the GeneralizedLinearModel object:
            PrintFitStatistics(model);

            //
            // Probit regression
            // 

            // In a second example, we investigate the relationship between
            // a binary outcome (admit) and the student's GRE score, grade point
            // average, and whether the student's school is a "top notch" school.
            // The fields are as follows:
            //   admit:    Dependent variable
            //   gre:      The student's GRE score.
            //   topnotch: A binary indicator of the type of school
            //   gpa:      The student's Grade Point Average.
            // 
            // The data was generated.
            // See http://www.ats.ucla.edu/stat/stata/dae/probit.htm

            // First, read the data from a file into a data frame.
            // The ReadGraduateData method is defined later in this file.
            data = ReadGraduateData();

            // Now create the regression model. Parameters are the data frame
            // containing all variables, the name of the dependent variable,
            // and a string array containing the names of the independent variables.
            model = new GeneralizedLinearModel(data,
                "admit", new string[] { "gre", "topnotch", "gpa" });

            // The ModelFamily specifies the distribution of the dependent variable.
            // Since we're dealing with binary data, we use a Binomial model:
            model.ModelFamily = ModelFamily.Binomial;

            // We use the probit link function.
            model.LinkFunction = LinkFunction.Probit;

            // The Fit method performs the actual regression analysis.
            model.Fit();

            // Print the same parameter table and fit statistics as for
            // the Poisson model.
            PrintParameters(model);
            PrintFitStatistics(model);

            // Console.ReadLine only returns once Enter is pressed,
            // so the prompt asks for Enter rather than "any key".
            Console.Write("Press Enter to exit.");
            Console.ReadLine();
        }

        /// <summary>
        /// Writes a table with one row per regression parameter of a fitted
        /// model to the console, followed by a blank line.
        /// </summary>
        /// <param name="model">The model whose Parameters collection is listed.
        /// Main calls Fit on the model before calling this method.</param>
        private static void PrintParameters(GeneralizedLinearModel model)
        {
            Console.WriteLine("Variable              Value    Std.Error    z     p-Value");
            foreach (var parameter in model.Parameters)
            {
                // Parameter objects have the following properties:
                Console.WriteLine("{0,-20}{1,10:F6}{2,10:F6}{3,8:F2} {4,7:F5}",
                    // Name, usually the name of the variable:
                    parameter.Name,
                    // Estimated value of the parameter:
                    parameter.Value,
                    // Standard error:
                    parameter.StandardError,
                    // The value of the z score for the hypothesis that the parameter
                    // is zero.
                    parameter.Statistic,
                    // Probability corresponding to the z score.
                    parameter.PValue);
            }
            Console.WriteLine();
        }

        /// <summary>
        /// Writes the log likelihood values and the "information criteria"
        /// of a fitted model to the console, followed by a blank line.
        /// </summary>
        /// <param name="model">The model whose statistics are printed.
        /// Main calls Fit on the model before calling this method.</param>
        private static void PrintFitStatistics(GeneralizedLinearModel model)
        {
            Console.WriteLine("Log likelihood:         {0:F4}", model.LogLikelihood);
            Console.WriteLine("Kernel log likelihood:  {0:F4}", model.GetKernelLogLikelihood());

            // Note that some statistical applications (notably stata) use 
            // a different definition of some of these "information criteria":
            Console.WriteLine("\"Information Criteria\"");
            Console.WriteLine("Akaike (AIC):           {0:F3}", model.GetAkaikeInformationCriterion());
            Console.WriteLine("Corrected AIC:          {0:F3}", model.GetCorrectedAkaikeInformationCriterion());
            Console.WriteLine("Bayesian (BIC):         {0:F3}", model.GetBayesianInformationCriterion());
            Console.WriteLine("Chi Square:             {0:F3}", model.GetChiSquare());
            Console.WriteLine();
        }

        /// <summary>
        /// Builds the relative path of a file in the sample data folder,
        /// four directory levels above the current directory.
        /// </summary>
        /// <param name="fileName">Name of the data file, including extension.</param>
        /// <returns>The relative path, joined with the directory separator
        /// of the current platform instead of a hard-coded backslash.</returns>
        private static string GetDataPath(string fileName)
        {
            return System.IO.Path.Combine("..", "..", "..", "..", "data", fileName);
        }

        /// <summary>
        /// Reads the student attendance data used in the Poisson regression
        /// example from the delimited text file PoissonReg.csv.
        /// </summary>
        /// <returns>A data frame with the contents of the file.</returns>
        public static DataFrame<long, string> ReadAttendanceData()
        {
            return DelimitedTextFile.ReadDataFrame(GetDataPath("PoissonReg.csv"));
        }

        /// <summary>
        /// Reads the data used in the probit regression example from
        /// the fixed-width text file probit.dat.
        /// </summary>
        /// <returns>A data frame whose columns are named
        /// admit, gre, topnotch and gpa.</returns>
        public static DataFrame<long, string> ReadGraduateData()
        {
            // The file is read without column headers, so the column names
            // are assigned afterwards.
            // NOTE(review): the integer array presumably lists the column
            // break positions - confirm against the FixedWidthTextOptions docs.
            var df = FixedWidthTextFile.ReadDataFrame(GetDataPath("probit.dat"),
                new FixedWidthTextOptions(new int[] { 9, 18, 27 }, columnHeaders:false));
            var columnNames = new string[] { "admit", "gre", "topnotch", "gpa" };
            return df.WithColumnIndex(columnNames);
        }
    }
}