Created
December 15, 2020 14:25
-
-
Save SolomidHero/0e8143567319a73505d092de3020d0e6 to your computer and use it in GitHub Desktop.
Word count (based on map/reduce) using bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Word count of a file using map/reduce.
# Args:
#   - file:      path to the input text file
#   - n_workers: number of parallel map workers
#######################################
# Map step: count word occurrences within a line range of a file.
# Punctuation and any whitespace (spaces, tabs, CR) act as word separators.
# Arguments:
#   $1 - path to the input file
#   $2 - first line of the segment (1-based, inclusive)
#   $3 - last line of the segment (inclusive)
# Outputs:
#   One "word count" line per distinct word, ordered by word.
#######################################
map() {
  local file=$1
  local from=$2
  local to=$3

  # An empty or inverted range has nothing to count; return early because
  # sed rejects a line address of 0.
  if (( from < 1 || to < from )); then
    return 0
  fi

  # Print only lines from..to and quit right after the last one so the rest
  # of the file is never read. Every separator becomes a newline (runs are
  # squeezed), and the possible leading empty line is dropped so that no
  # empty "word" reaches uniq.
  sed -n -e "${from},${to}p" -e "${to}q" -- "$file" \
    | tr -s '[:punct:][:space:]' '\n' \
    | sed '/^$/d' \
    | sort \
    | uniq -c \
    | awk '{ print $2 " " $1 }'
}
export -f map
#######################################
# Reduce step: sum the counts of identical words.
# Reads "word count" lines on stdin. Lines with fewer than two fields
# (blank lines, or a count without a word) are ignored so they cannot
# create bogus zero-count entries.
# Outputs:
#   One "word total" line per distinct word, in unspecified order.
#######################################
reduce() {
  awk 'NF >= 2 { total[$1] += $2 } END { for (word in total) print word, total[word] }'
}
export -f reduce
# read arguments, define parameters
if (( $# != 2 )); then
  printf 'Usage: %s <file> <n_workers>\n' "${0##*/}" >&2
  exit 2
fi
input_file=$1
n_workers=$2

if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
  printf 'error: cannot read file: %s\n' "$input_file" >&2
  exit 1
fi
# Reject 0, negatives, non-numbers and leading zeros (which arithmetic
# expansion would interpret as octal).
if [[ ! "$n_workers" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: n_workers must be a positive integer, got: %s\n' "$n_workers" >&2
  exit 2
fi

# Count lines the way sed addresses them: unlike `wc -l`, this also counts a
# final line that has no trailing newline, so its words are not dropped.
n_lines=$(awk 'END { print NR }' < "$input_file") || exit 1

# An empty file has no words; nothing to print.
if (( n_lines == 0 )); then
  exit 0
fi

# create arguments for map function (to parallelize it)
segment_size=$(( n_lines / n_workers ))
remainder=$(( n_lines % n_workers ))
end=0
map_args=()
# fill array with (file, segment_left, segment_right); the first `remainder`
# segments get one extra line so that every line is covered exactly once
for (( i = 1; i <= n_workers; i++ )); do
  addition=$(( i <= remainder ? 1 : 0 ))
  start=$(( end + 1 ))
  end=$(( end + segment_size + addition ))
  # More workers than lines leaves trailing segments empty; skip them.
  if (( start > end )); then
    continue
  fi
  map_args+=("$input_file" "$start" "$end")
done

# Each worker writes its partial result to its own file: workers sharing one
# stdout pipe under `xargs -P` can interleave output mid-line and corrupt
# the "word count" records.
parts_dir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$parts_dir"' EXIT
export WORDCOUNT_PARTS_DIR=$parts_dir

# parallelize map and reduce, then use single reduce with sorting
# Arguments are passed NUL-delimited so file names containing whitespace or
# quotes reach map unchanged. The segment start line ($2) is unique per
# worker and therefore serves as the part file suffix.
printf '%s\0' "${map_args[@]}" \
  | xargs -0 -P "$n_workers" -n 3 \
      bash -c 'map "$1" "$2" "$3" | reduce > "$WORDCOUNT_PARTS_DIR/part.$2"' _ \
  || { printf 'error: a map worker failed\n' >&2; exit 1; }

cat -- "$parts_dir"/part.* \
  | reduce \
  | sort -k2nr -k1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment