Last active
August 29, 2015 14:10
-
-
Save gpfreitas/a8eaf31201ba3a2ab1b0 to your computer and use it in GitHub Desktop.
Test operations with NULL, NaN and Infinity in Pig
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env sh

# Help text, laid out like a man page. It is printed verbatim when the script
# is invoked with any argument. The two tables are sample results and are kept
# aligned with their border rows so they stay readable in a terminal.
USAGE='NAME
    pig_nan_null_inf.sh -- Test operations with NULL, NaN and Infinity

SYNOPSIS
    ./pig_nan_null_inf.sh

DESCRIPTION
    By running this script in a computer with Apache Pig installed, you should
    obtain two files: arithmetic.tsv and comparisons.tsv. These files
    contain tab-separated records that tabulate operations between values
    of `a` and `b` when `a` and `b` are not only regular numbers, but also
    NaN, NULL and Infinity. The first line of each file is a header with
    the column names.

    My arithmetic.tsv under Pig 0.13 looks like this (pretty printed)

    |------------+-----------+-----------+-----------+-----------+----------------|
    |  a         | b         | a_plus_b  | a_minus_b | a_times_b | a_dividedby_b  |
    |------------+-----------+-----------+-----------+-----------+----------------|
    |  -Infinity | -Infinity | -Infinity | NaN       | Infinity  | NaN            |
    |  -Infinity | 0.0       | -Infinity | -Infinity | NaN       |                |
    |  -Infinity | NaN       | NaN       | NaN       | NaN       | NaN            |
    |  -Infinity |           |           |           |           |                |
    |  0.0       | -Infinity | -Infinity | Infinity  | NaN       | -0.0           |
    |  0.0       | 0.0       | 0.0       | 0.0       | 0.0       |                |
    |  0.0       | NaN       | NaN       | NaN       | NaN       | NaN            |
    |  0.0       |           |           |           |           |                |
    |  NaN       | -Infinity | NaN       | NaN       | NaN       | NaN            |
    |  NaN       | 0.0       | NaN       | NaN       | NaN       |                |
    |  NaN       | NaN       | NaN       | NaN       | NaN       | NaN            |
    |  NaN       |           |           |           |           |                |
    |            | -Infinity |           |           |           |                |
    |            | 0.0       |           |           |           |                |
    |            | NaN       |           |           |           |                |
    |            |           |           |           |           |                |
    |------------+-----------+-----------+-----------+-----------+----------------|

    My comparisons.tsv under Pig 0.13 looks like this (pretty printed)

    |------------+-----------+---------+--------+--------+---------+--------+----------|
    |  a         | b         | a_geq_b | a_gt_b | a_lt_b | a_leq_b | a_eq_b | a_neq_b  |
    |------------+-----------+---------+--------+--------+---------+--------+----------|
    |  -Infinity | -Infinity | true    | false  | false  | true    | true   | false    |
    |  -Infinity | 0.0       | false   | false  | true   | true    | false  | true     |
    |  -Infinity | NaN       | false   | false  | true   | true    | false  | true     |
    |  -Infinity |           |         |        |        |         |        |          |
    |  0.0       | -Infinity | true    | true   | false  | false   | false  | true     |
    |  0.0       | 0.0       | true    | false  | false  | true    | true   | false    |
    |  0.0       | NaN       | false   | false  | true   | true    | false  | true     |
    |  0.0       |           |         |        |        |         |        |          |
    |  NaN       | -Infinity | true    | true   | false  | false   | false  | true     |
    |  NaN       | 0.0       | true    | true   | false  | false   | false  | true     |
    |  NaN       | NaN       | true    | false  | false  | true    | true   | false    |
    |  NaN       |           |         |        |        |         |        |          |
    |            | -Infinity |         |        |        |         |        |          |
    |            | 0.0       |         |        |        |         |        |          |
    |            | NaN       |         |        |        |         |        |          |
    |            |           |         |        |        |         |        |          |
    |------------+-----------+---------+--------+--------+---------+--------+----------|

NOTES
    Pretty printed tables produced with csvlook, from the csvkit Python
    package. The shell command column -t might not format those tables
    correctly (mine did not).'
# Usage: the script takes no arguments, so any argument at all is answered
# with the help text and exit status 1.
if [ "$#" -gt 0 ]
then
    # printf rather than echo: the text is printed exactly as stored, whatever
    # the echo implementation of the running sh does with special characters.
    printf '%s\n' "$USAGE"
    exit 1
fi
# Generate data: we use these values because in Pig (in fact in most other
# languages)
#
#     log(0)  = -Infinity
#     log(1)  = 0.0
#     log(-1) = NaN
#
# and because we also want to test NULL values.
# NOTE(review): the literal NULL presumably loads as a Pig null because it
# does not parse as a double -- confirm against the Pig version in use.
#
# The here-document delimiter is quoted so the shell copies the four lines
# into data.tsv verbatim, one value per line.
cat > data.tsv <<'EOD'
0
1
-1
NULL
EOD
# Run the actual Pig script in local mode, feeding it on stdin. The script:
#   * loads data.tsv twice and crosses the two relations, giving every
#     (a, b) pair of input values;
#   * replaces each value by its LOG;
#   * tabulates a+b, a-b, a*b and a/b into `arithmetic`, and the six
#     comparison operators into `comparisons`;
#   * stores both relations with PigStorage('\t', '-schema') into the
#     directories ./arithmetic and ./comparisons.
#
# The here-document delimiter is quoted so the shell hands the text to Pig
# untouched (in particular the '\t' separator).
#
# Stop right away if pig cannot be run or exits with a non-zero status,
# instead of going on to assemble output that was never produced.
pig -x local <<'EOP' || { echo "pig_nan_null_inf.sh: pig failed or is not installed" 1>&2; exit 1; }
a = LOAD 'data.tsv' as (x: double);
b = LOAD 'data.tsv' as (x: double);
c = cross a, b;
d = foreach c generate a::x as a, b::x as b;
e = foreach d generate LOG(a) as a, LOG(b) as b;
arithmetic = foreach e
    generate a,
             b,
             a + b as a_plus_b,
             a - b as a_minus_b,
             a * b as a_times_b,
             a / b as a_dividedby_b;
comparisons = foreach e
    generate a,
             b,
             (boolean) ((a >= b) ? 1 : 0) as a_geq_b,
             (boolean) ((a > b) ? 1 : 0) as a_gt_b,
             (boolean) ((a < b) ? 1 : 0) as a_lt_b,
             (boolean) ((a <= b) ? 1 : 0) as a_leq_b,
             (boolean) ((a == b) ? 1 : 0) as a_eq_b,
             (boolean) ((a != b) ? 1 : 0) as a_neq_b;
store arithmetic into 'arithmetic' using PigStorage('\t', '-schema');
store comparisons into 'comparisons' using PigStorage('\t', '-schema');
EOP
# Create the output files with their headers: for each relation, concatenate
# the .pig_header file and the part* files from the directory of the same name
# into <relation>.tsv in the current directory.
for rel in "arithmetic" "comparisons"
do
    if ! cat "$rel"/.pig_header "$rel"/part* > "$rel".tsv
    then
        # The redirection has already created the target; do not leave an
        # empty or partial file behind, and do not report success.
        rm -f "$rel".tsv
        echo "pig_nan_null_inf.sh: could not build $rel.tsv from ./$rel" 1>&2
        exit 1
    fi
done
echo
echo
echo "--> Created files arithmetic.tsv and comparisons.tsv" 1>&2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note, on line 57, that NaN == NaN evaluates to true.