Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tsv-sample inorder #226

Merged
merged 18 commits into from
Sep 23, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
More unit tests.
  • Loading branch information
jondegenhardt committed Sep 21, 2019
commit 7cc89e1ce0c0018c9fdc9de17f7121e52e24631d
156 changes: 109 additions & 47 deletions tsv-sample/src/tsv_utils/tsv-sample.d
Original file line number Diff line number Diff line change
Expand Up @@ -2075,6 +2075,45 @@ unittest
[["random_value", "field_a", "field_b", "field_c"],
["0.96055546286515892", "yellow", "黄", "12"]];

string[][] data3x6ExpectedWt3Num6Inorder =
[["field_a", "field_b", "field_c"],
["red", "赤", "23.8"],
["green", "緑", "0.0072"],
["white", "白", "1.65"],
["yellow", "黄", "12"],
["blue", "青", "12"],
["black", "黒", "0.983"]];

string[][] data3x6ExpectedWt3Num5Inorder =
[["field_a", "field_b", "field_c"],
["green", "緑", "0.0072"],
["white", "白", "1.65"],
["yellow", "黄", "12"],
["blue", "青", "12"],
["black", "黒", "0.983"]];

string[][] data3x6ExpectedWt3Num4Inorder =
[["field_a", "field_b", "field_c"],
["white", "白", "1.65"],
["yellow", "黄", "12"],
["blue", "青", "12"],
["black", "黒", "0.983"]];

string[][] data3x6ExpectedWt3Num3Inorder =
[["field_a", "field_b", "field_c"],
["yellow", "黄", "12"],
["blue", "青", "12"],
["black", "黒", "0.983"]];

string[][] data3x6ExpectedWt3Num2Inorder =
[["field_a", "field_b", "field_c"],
["yellow", "黄", "12"],
["black", "黒", "0.983"]];

string[][] data3x6ExpectedWt3Num1Inorder =
[["field_a", "field_b", "field_c"],
["yellow", "黄", "12"]];


string[][] data3x6ExpectedBernoulliProbsP100 =
[["random_value", "field_a", "field_b", "field_c"],
Expand Down Expand Up @@ -2159,6 +2198,7 @@ unittest
["white", "白", "1.65"],
["green", "緑", "0.0072"]];


string[][] data3x6ExpectedReplaceNum10 =
[["field_a", "field_b", "field_c"],
["black", "黒", "0.983"],
Expand Down Expand Up @@ -2821,7 +2861,7 @@ unittest
* Enough setup! Actually run some tests!
*/

/* Permutations. Headers, static seed, compatibility mode. With weights and without. */
/* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
Expand All @@ -2836,7 +2876,7 @@ unittest
testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);

/* Permutations, without compatibility mode, or with both compatibility and printing. */
/* Shuffling, without compatibility mode, or with both compatibility and printing. */
testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
Expand Down Expand Up @@ -2932,7 +2972,7 @@ unittest
testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);

/* Permutations, compatibility mode, without headers. */
/* Shuffling, compatibility mode, without headers. */
testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]);
testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]);
testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]);
Expand All @@ -2943,7 +2983,7 @@ unittest
testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]);

/* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. */
/* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */
testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]);
testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]);
Expand All @@ -2965,6 +3005,7 @@ unittest
testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1..$]);
testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1..$]);

/* Reservoir sampling using Algorithm R, no headers, inorder output. */
testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1..$]);
Expand Down Expand Up @@ -3124,7 +3165,7 @@ unittest
testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);

/* Tests of subset sample (--n|num) field.
/* Tests of the subset sample size option (-n|--num). Random sampling, Bernoulli sampling, distinct sampling.
*
* Note: The way these tests are done ensures that subset length does not affect
* output order.
Expand Down Expand Up @@ -3196,7 +3237,46 @@ unittest
fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
}

/* Inorder tests with reservoir sampling via heap (compatibility mode). */
/* Similar tests with the 1x10 data set. */
for (size_t n = data1x10.length + 2; n >= 1; n--)
{
size_t expectedLength = min(data1x10.length, n + 1);
testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
"-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);

testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
"-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);

testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);

testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
"-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
}

/* Simple random sampling with replacement: ensure sample size doesn't change order. */
for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
{
testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
data3x6ExpectedReplaceNum10[0 .. n + 1]);

testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
data3x6ExpectedReplaceNum10[1 .. n + 1]);
}

/* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
{
size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);

testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
"-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);

testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
}

/* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
Expand All @@ -3223,64 +3303,46 @@ unittest
testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1..$]);
testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1..$]);


/* Inorder sampling tests with random number printing. --compatibility-mode not needed. */
testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
testTsvSample(["test-at19", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
testTsvSample(["test-at20", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);

testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1..$]);
testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1..$]);
testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1..$]);
testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1..$]);
testTsvSample(["test-au19", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1..$]);
testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1..$]);
testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1..$]);
testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1..$]);

/* Similar tests with the 1x10 data set. */
for (size_t n = data1x10.length + 2; n >= 1; n--)
{
size_t expectedLength = min(data1x10.length, n + 1);
testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
"-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);

testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
"-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
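/* Inorder weighted sampling tests. NOTE(review): none of the invocations below pass a weight field option ("-w"), unlike the weighted tests elsewhere (e.g. test-a13) - confirm whether "-w", "3" is missing here. */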
testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);

testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1..$]);
testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1..$]);
testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1..$]);
testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1..$]);
testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1..$]);
testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1..$]);
testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1..$]);

testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);

testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
"-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
}

/* Simple random sampling with replacement: ensure sample size doesn't change order. */
for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
{
testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
data3x6ExpectedReplaceNum10[0 .. n + 1]);

testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
data3x6ExpectedReplaceNum10[1 .. n + 1]);
}

/* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
{
size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);

testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
"-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);

testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
}


/* Distinct sampling tests. */
/*
* Distinct sampling tests.
*/
testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
data5x25ExpectedDistinctK2P40);

Expand Down