Grouping and Aggregation in C# QuickStart Sample

Illustrates how to group data and how to compute aggregates over groups and entire datasets in C#.

View this sample in: Visual Basic, F#

using System;

using Numerics.NET.Data.Text;
using Numerics.NET.DataAnalysis;
using Numerics.NET;
using Index = Numerics.NET.DataAnalysis.Index;

namespace Numerics.NET.QuickStart.CSharp {

    /// <summary>
    /// Illustrates how to group data and how to compute aggregates 
    /// over groups and entire datasets.
    /// </summary>
    /// <remarks>
    /// The sample walks through aggregators, groupings, moving and
    /// expanding windows, resampling, and pivot tables. Several results
    /// are computed only to demonstrate the call and are not printed.
    /// </remarks>
    class GroupingAndAggregation {

        /// <summary>
        /// Entry point of the sample. Loads the Titanic dataset, runs each
        /// aggregation example in turn, and writes selected results to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args) {

            // The license is verified at runtime. We're using
            // a 30 day trial key here. For more information, see
            //     https://numerics.net/trial-key
            Numerics.NET.License.Verify("64542-18980-57619-62268");

            // We work with the Titanic dataset.
            // The path is assembled with Path.Combine so that the platform's
            // directory separator is used; a literal backslash is not a
            // separator on Linux or macOS. On Windows the resulting string
            // is the same as the hard-coded "..\..\..\..\data\titanic.csv".
            // NOTE(review): presumably resolved against the current working
            // directory (four levels below the folder that holds "data") - confirm.
            var titanicPath = System.IO.Path.Combine(
                "..", "..", "..", "..", "data", "titanic.csv");
            var titanic = DelimitedTextFile.ReadDataFrame(titanicPath);
            // We'll use these columns often:
            var age = titanic.GetColumn("Age");
            var survived = titanic["Survived"].As<bool>();
            // We want to group by the passenger class,
            // so we make this a categorical vector.
            var pclass = titanic["Pclass"].AsCategorical();

            //
            // Aggregators and Aggregation
            // 

            // The Aggregators class defines all common aggregator functions.
            // Here we compute the mean and do the computations using the double
            // type. The Aggregate method applies the aggregator
            // to every column in the data frame:
            var means = titanic.Aggregate(Aggregators.Mean.As<double>());
            Console.WriteLine(means.Summarize());

            // We can create custom aggregators. Here we compute
            // the fraction of true values of a boolean vector.
            // The cast to double avoids integer division.
            var trueFraction = Aggregators.Create(
                (Vector<bool> b) => (double)b.CountTrue() / b.Count);
            var pctSurvived = survived.Aggregate(trueFraction);

            // We can also compute more than one aggregate:
            var descriptives = titanic.Aggregate(
                Aggregators.Count,
                Aggregators.Mean.As<double>(),
                Aggregators.StandardDeviation.As<double>());
            Console.WriteLine(descriptives.Summarize());

            // Aggregations can be applied to individual vectors:
            var meanAge = age.Aggregate(Aggregators.Mean);

            // Or to rows or columns of a matrix:
            var m = Matrix.CreateRandom(5, 8);
            var meanByRow = m.AggregateRows(Aggregators.Mean);
            var meanByColumn = m.AggregateColumns(Aggregators.Mean);

            //
            // Groupings
            //

            // By defining a grouping, we can compute the aggregate
            // for each group.

            // The simplest grouping is by value, similar to 
            // GROUP BY clauses in database queries.

            // Let's get the average age by class:
            var ageByClass = age.AggregateBy(pclass, Aggregators.Mean);

            // Grouping by quantile means we sort the values
            // and divide the result into groups of the same size.
            var byQuantile = Grouping.ByQuantile(age, 5);
            var survivedByAgeGroup = survived.AggregateBy(byQuantile, trueFraction);
            Console.WriteLine("Survival rate by age group:");
            Console.WriteLine(survivedByAgeGroup.Summarize());

            // For the remainder we will use a vector with a DateTime index:
            var x = Vector.CreateRandom(200);
            var dates = Index.CreateDateRange(new DateTime(2016, 1, 1), x.Length);
            x.Index = dates;

            // A partition is a straight division of the data into equal groups:
            var partition = Grouping.Partition(dates, 10,
                alignToEnd: true, skipIncomplete: true);
            var partitionAvg = x.AggregateBy(partition, Aggregators.Mean);
            Console.WriteLine("Avg. by partition:");
            Console.WriteLine(partitionAvg);

            //
            // Moving and expanding windows
            //

            // Moving or rolling averages and related statistics 
            // can be computed efficiently by using moving windows:
            var window = Grouping.Window(dates, 20);
            var ma20 = x.AggregateBy(window, Aggregators.Mean);
            Console.WriteLine("ma20:");
            Console.WriteLine(ma20.GetSlice(0, 20));
            // Moving standard deviation is just as simple:
            var mstd20 = x.AggregateBy(window, Aggregators.StandardDeviation);
            Console.WriteLine("mstd20:");
            Console.WriteLine(mstd20.GetSlice(0, 20));

            // Moving windows can have a fixed number of elements, as above,
            // or a fixed maximum width:
            var window2 = Grouping.RangeWindow(dates, TimeSpan.FromDays(20));
            var ma20_2 = x.AggregateBy(window2, Aggregators.Mean);

            // Expanding windows keep the starting point and move the end point
            // forward in time:
            var expanding = Grouping.ExpandingWindow(dates);
            var expAvg = x.AggregateBy(expanding, Aggregators.Mean);
            Console.WriteLine("expAvg:");
            Console.WriteLine(expAvg.GetSlice(0, 10));

            // 
            // Resampling
            //

            // Resampling means computing values for a series 
            // with longer periods by aggregating over the values
            // for shorter periods.

            // We start by creating an index with the boundaries,
            // in this case the 10th of each month.
            var months = Index.CreateDateRange(new DateTime(2016, 1, 10),
                12, Recurrence.Monthly);
            // We then create the resampling grouping from this:
            // Giving the Direction argument as Backward means that
            // the last value in the time period is used as the key
            // for the group.
            var resampling1 = Grouping.Resample(dates, months, Direction.Backward);
            // We can also obtain this grouping in one step:
            var resampling2 = Grouping.Resample(dates,
                Recurrence.Monthly.Day(10), Direction.Backward);
            var resampled = x.AggregateBy(resampling2, Aggregators.Mean);

            //
            // Pivot tables
            //

            // A pivot table is a 2-dimensional grouping on two key columns.
            // For this, we go back to the Titanic dataset, and we compute 
            // the survival rate per class in a different way. We group
            // by class and by whether the passenger survived:
            var pivot = Grouping.Pivot(
                titanic["Pclass"].As<int>(),
                titanic["Survived"].As<bool>());
            // We can then get the number of elements in each group
            // as a matrix, with rows indexed by class and columns
            // indexed by survived:
            var counts = pivot.CountsMatrix();
            // Scaling by the row sums gives us the fraction
            // of survived/did not survive for each class.
            // Notice that the rows and columns of the matrix
            // are labeled by the class and survival status.
            // NOTE(review): the "InPlace" suffix suggests counts itself is
            // modified and fractions refers to the same matrix - confirm
            // before reusing counts as raw counts.
            var fractions = counts.UnscaleRowsInPlace(counts.GetRowSums());
            Console.WriteLine(fractions.Summarize());

            // ReadLine only returns once Enter is pressed, so the prompt
            // asks for Enter rather than "any key".
            Console.Write("Press Enter to exit.");
            Console.ReadLine();
        }
    }
}