Logistic Regression in IronPython QuickStart Sample
Illustrates how to use the LogisticRegressionModel class to create logistic regression models in IronPython.
View this sample in: C# Visual Basic F#
```Python import numerics from System import Char, Array from Extreme.Statistics import * from Extreme.Mathematics import * from Extreme.Mathematics.LinearAlgebra.IO import * # Illustrates building logistic regression models using # the LogisticRegressionModel class in the # Extreme.Statistics namespace of the Extreme # Optimization Numerical Libraries for .NET. # Logistic regression can be performed using # the LogisticRegressionModel class. # # This QuickStart sample uses data from a study of factors # that determine low birth weight at Baystate Medical Center. # from Belsley, Kuh and Welsch. The fields are as follows: # AGE: Mother's age. # LWT: Mother's weight. # RACE: 1=white, 2=black, 3=other. # FVT: Number of physician visits during the 1st trimester. # LOW: Low birth weight indicator. # First, read the data from a file into an ADO.NET DataTable. # For the sake of clarity, we put this code in its own method. import clr clr.AddReference("System.Data") from System.Data import * from System.IO import * # <summary> # Reads the data from a text file into a <see cref="DataTable"/>. # </summary> # <returns>A <see cref="DataTable"/></returns> def ReadData(): data = DataTable("LowBirthWeight") # data.Columns.Add("Key", typeof(string)) whitespace = Array[Char]([ ' ', '\t' ]) sr = StreamReader(r"..\Data\lowbwt.txt") # Read the header and extract the field names. line = sr.ReadLine() pos = 0 while True: while Char.IsWhiteSpace(line[pos]): pos = pos + 1 pos2 = line.IndexOfAny(whitespace, pos) if pos2 < 0: data.Columns.Add(line.Substring(pos), float) break else: data.Columns.Add(line.Substring(pos, pos2 - pos), float) pos = pos2 if pos < 0: break # Now read the data and add them to the table. # Assumes all columns except the first are numerical. rowData = Array.CreateInstance(object, data.Columns.Count) line = sr.ReadLine() while line != None and line.Length > 0: column = 0 pos = 0 while True: while Char.IsWhiteSpace(line[pos]): pos = pos + 1 pos2 = line.IndexOfAny(whitespace, pos) if pos2 < 0: field = line.Substring(pos) else: field = line.Substring(pos, pos2 - pos) if column == 0: rowData[column] = field else: rowData[column] = float.Parse(field) column = column + 1 pos = pos2 if pos < 0 or column >= data.Columns.Count: break data.Rows.Add(rowData) line = sr.ReadLine() return data dataTable = ReadData() data = VariableCollection(dataTable) # We need indicator variables for the race. We use the # CreateIndicatorVariable method: race = data["RACE"] race2 = race.CreateIndicatorVariable(2.0) data.Add(race2) race3 = race.CreateIndicatorVariable(3.0) data.Add(race3) # Now create the regression model. Parameters are the name # of the dependent variable, a string array containing # the names of the independent variables, and the VariableCollection # containing all variables. model = LogisticRegressionModel(data, "LOW", Array[str] ([ "AGE", "LWT", "RACE(2)", "RACE(3)", "FTV" ])) # The Compute method performs the actual regression analysis. model.Compute() # The Parameters collection contains information about the regression # parameters. print "Variable Value Std.Error t-stat p-Value" for parameter in model.Parameters: # Parameter objects have the following properties: print "{0:20}{1:10.5f}{2:10.5f}{3:8.2f} {4:7.4f}".format( # Name, usually the name of the variable: parameter.Name, # Estimated value of the parameter: parameter.Value, # Standard error: parameter.StandardError, # The value of the t statistic for the hypothesis that the parameter # is zero. parameter.Statistic, # Probability corresponding to the t statistic. parameter.PValue) # The log-likelihood of the computed solution is also available: print "Log-likelihood: {0:.4f}".format( model.GetLogLikelihood()) # We can test the significance by looking at the results # of a log-likelihood test, which compares the model to # a constant-only model: lrt = model.GetLikelihoodRatioTest() print "Likelihood-ratio test: chi-squared={0:.4f}, p={1:.4f}".format(lrt.Statistic, lrt.PValue) print # We can compute a model with fewer parameters: model2 = LogisticRegressionModel(data, "LOW", Array[str] ([ "LWT", "RACE(2)", "RACE(3)" ])) model2.Compute() # Print the results... print "Variable Value Std.Error t-stat p-Value" for parameter in model2.Parameters: print "{0:20}{1:10.5f}{2:10.5f}{3:8.2f} {4:7.4f}".format( \ parameter.Name, parameter.Value, parameter.StandardError, parameter.Statistic, parameter.PValue) # ...including the log-likelihood: print "Log-likelihood: {0:.4f}".format(model.GetLogLikelihood()) # We can now compare the original model to this one, once again # using the likelihood ratio test: lrt = model.GetLikelihoodRatioTest(model2) print "Likelihood-ratio test: chi-squared={0:.4f}, p={1:.4f}".format(lrt.Statistic, lrt.PValue) print # # Multinomial (polytopous) logistic regression # # The LogisticRegressionModel class can also be used # for logistic regression with more than 2 responses. # The following example is from "Applied Linear Statistical # Models." # Load the data into a matrix from System.Globalization import NumberStyles reader = FixedWidthMatrixReader( \ File.OpenText(r"..\Data\mlogit.txt"), 0, \ Array[int]([5, 10, 15, 20, 25, 32, 37, 42, 47 ]), \ NumberStyles.Integer, None) m = reader.ReadMatrix() # Next, convert the columns to variables. # For multinomial regression, the response variable must be # a CategoricalVariable: duration = NumericalVariable("duration", m.GetColumn(1)).ToCategoricalVariable() nutritio = NumericalVariable("nutritio", m.GetColumn(5)) agecat1 = NumericalVariable("agecat1", m.GetColumn(6)) agecat3 = NumericalVariable("agecat3", m.GetColumn(7)) alcohol = NumericalVariable("alcohol", m.GetColumn(8)) smoking = NumericalVariable("smoking", m.GetColumn(9)) # The constructor takes an extra argument of type # LogisticRegressionMethod: model3 = LogisticRegressionModel(duration, \ Array[NumericalVariable]([ nutritio, agecat1, agecat3, alcohol, smoking ]), \ LogisticRegressionMethod.Nominal) # Everything else is the same: model3.Compute() # There is a set of parameters for each level of the # response variable. The highest level is the reference # level and has no associated parameters. for p in model3.Parameters: print p.ToString() print "Log likelihood: {0:.4f}".format(model3.GetLogLikelihood()) # To test the hypothesis that all the slopes are zero, # use the GetLikelihoodRatioTest method. lrt = model3.GetLikelihoodRatioTest() print "Test that all slopes are zero: chi-squared={0:.4f}, p={1:.4f}".format(lrt.Statistic, lrt.PValue) ```