Data Wrangling in C# QuickStart Sample
Illustrates how to perform basic data wrangling or data munging operations on data frames using classes in the Numerics.NET.DataAnalysis namespace in C#.
This sample is also available in: Visual Basic, F#.
Overview
This QuickStart sample demonstrates essential data wrangling operations using Numerics.NET’s DataFrame class. Data wrangling, also known as data munging, is the process of transforming and mapping data from one format into another to make it more suitable for analysis.
The sample covers several key data wrangling operations:
- Joining data frames using different join types (outer, inner, left, right)
- Working with different data sources and combining them into a single data frame
- Handling missing data during joins
- Performing one-to-one and one-to-many joins using indexes and keys
- Time series data alignment using nearest-neighbor joins
- Sorting data frames by index or column values
The example uses realistic scenarios like combining presidential data with state information and aligning time series data with different timestamps. It demonstrates both exact matching joins and approximate matching for time series data that may be offset by a few hours.
The code
using System;
using System.Collections.Generic;
using Numerics.NET.DataAnalysis;
using Numerics.NET;
using Index = Numerics.NET.DataAnalysis.Index;
// Illustrates how to perform basic data wrangling operations
// on data frames.
// The license is verified at runtime. We're using
// a 30 day trial key here. For more information, see
// https://numerics.net/trial-key
Numerics.NET.License.Verify("your-trial-key-here");
//
// Joining and reshaping
//
// When data comes from different sources,
// the Append method lets you join the two
// data frames:
var frame = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "A", new string[] { "A0", "A1", "A2", "A3"} },
{ "B", new string[] { "B0", "B1", "B2", "B3"} },
{ "C", new string[] { "C0", "C1", "C2", "C3"} },
{ "D", new string[] { "D0", "D1", "D2", "D3" } } },
Index.Default(0, 3));
var df2 = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "A", new string[] { "A4", "A5", "A6", "A7"} },
{ "B", new string[] { "B4", "B5", "B6", "B7"} },
{ "C", new string[] { "C4", "C5", "C6", "C7"} },
{ "D", new string[] { "D4", "D5", "D6", "D7" } } },
Index.Default(4, 7));
var df12 = frame.Append(df2);
// It is possible to join more than 2 data frames:
var df3 = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "A", new string[] { "A8", "A9", "A10", "A11"} },
{ "B", new string[] { "B8", "B9", "B10", "B11"} },
{ "C", new string[] { "C8", "C9", "C10", "C11"} },
{ "D", new string[] { "D8", "D9", "D10", "D11" } } },
Index.Default(8, 11));
var df123 = DataFrame.Append(frame, df2, df3);
// When the columns don't match, you can specify
// a join operation which determines which columns
// to keep in the result. If a column is missing
// in a data frame and present in the result,
// missing values are inserted.
frame = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "A", new string[] { "A0", "A1", "A2", "A3"} },
{ "B", new string[] { "B0", "B1", "B2", "B3"} },
{ "C", new string[] { "C0", "C1", "C2", "C3"} } },
Index.Default(0, 3));
df2 = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "A", new string[] { "A4", "A5", "A6", "A7"} },
{ "B", new string[] { "B4", "B5", "B6", "B7"} },
{ "D", new string[] { "D4", "D5", "D6", "D7" } } },
Index.Default(4, 7));
var df12outer = frame.Append(df2, JoinType.Outer);
var df12Inner = frame.Append(df2, JoinType.Inner);
// Left column join is equivalent to using the left column index:
var df12Left = frame.Append(df2, JoinType.Left);
var df12Left2 = frame.Append(df2, frame.ColumnIndex);
// Again, these are equivalent:
var df12Right = frame.Append(df2, JoinType.Right);
var df12Right2 = frame.Append(df2, df2.ColumnIndex);
// One to one joins match rows on their keys:
var dates1 = Index.CreateDateRange(new DateTime(2015, 11, 11), 5, Recurrence.Daily);
var df4 = Vector.CreateRandom(5).ToDataFrame(dates1, "values1");
var dates2 = Index.CreateDateRange(dates1[2], 5, Recurrence.Daily);
var df5 = Vector.CreateRandom(5).ToDataFrame(dates2, "values2");
var df6 = DataFrame.Join(df4, JoinType.Outer, df5);
Console.WriteLine(df6);
// One to many joins match one data frame's index to another's
// column.
// Create a list of presidents:
var numbers = Index.Create(new[] { 44, 43, 42, 41, 40 });
var names = Vector.Create("Barack Obama", "George W. Bush", "Bill Clinton",
"George H.W. Bush", "Ronald Reagan");
var homeStates = Vector.Create("IL", "TX", "AR", "TX", "CA");
var presidents = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "Name", names }, { "Home state", homeStates } }, numbers);
// And a list of states indexed by their abbreviations:
var abbreviations = Index.Create(new[] { "AR", "CA", "GA", "MI", "IL", "TX" });
var stateNames = Vector.Create("Arkansas", "California", "Georgia",
"Michigan", "Illinois", "Texas");
var states = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "Full name", stateNames} }, abbreviations);
// Now get the full names of states in the list:
var presidentsWithState = DataFrame.Join(presidents, JoinType.Left, states, key: "Home state");
Console.WriteLine(presidentsWithState);
// When the indexes are sorted, it is possible
// to do an inexact join to the nearest value.
// This is useful for time series where one series
// if offset by a few hours relative to the other:
var dates7 = Index.CreateDateRange(new DateTime(2015, 11, 11), 5, Recurrence.Daily);
var df7 = Vector.CreateRandom(5).ToDataFrame(dates7, "values1");
var dates8 = Index.CreateDateRange(dates7[0].AddHours(3), 5, Recurrence.Daily);
var df8 = Vector.CreateRandom(5).ToDataFrame(dates8, "values2");
var df9 = df7.JoinOnNearest(df8, Direction.Backward);
Console.WriteLine(df9);
//
// Sorting and filtering
//
// Data frames can be sorted by their index or by
// a column. The sort methods always return a new data frame.
dates2 = Index.CreateDateRange(new DateTime(2015, 11, 11), 15, Recurrence.Daily);
var frame2 = DataFrame.FromColumns(new Dictionary<string, object>() {
{ "values1", Vector.CreateRandom(dates2.Length) },
{ "values2", Vector.CreateRandom(dates2.Length) },
{ "values3", Vector.CreateRandom(dates2.Length) } }, dates2);
var frame3 = frame2.SortByIndex(SortOrder.Descending);
var frame4 = frame2.SortBy("values1", SortOrder.Ascending);
Console.Write("Press Enter key to exit...");
Console.ReadLine();