SolomidHero · December 15, 2020 14:25
diff --git a/wc.sh b/wc.sh
 #!/bin/bash
 # Word count of file using map reduce
 # Args:
 #   - file
 #   - n_workers

 # Map phase: count the words found in one line range of a file.
 # Args:
 #   $1 - path of the file to read
 #   $2 - first line of the segment (1-based, inclusive)
 #   $3 - last line of the segment (inclusive)
 # Outputs:
 #   one "word count" line per distinct word of the segment, on stdout
 map() {
  local file=$1
  local from=$2
  local to=$3

  # An empty segment (from > to, e.g. more workers than lines) has nothing
  # to count; returning early also keeps an invalid "0q" address away from sed.
  (( from <= to )) || return 0

  # sed prints only the requested lines and quits right after the last one,
  # so the rest of the file is never read.
  # Punctuation and every kind of whitespace (tabs included) separate words;
  # -s squeezes runs of separators into a single newline.
  # The final awk keeps only well-formed "count word" pairs: a segment that
  # starts with a separator yields one empty token, which must not be emitted.
  sed -n -- "${from},${to}p; ${to}q" "$file" \
  | tr -s '[:punct:][:space:]' '\n' \
  | sort \
  | uniq -c \
  | awk 'NF == 2 { print $2" "$1 }'
 }
 export -f map

 # Reduce phase: merge partial counts read from stdin.
 # Input:  lines of the form "word count"; blank lines are ignored.
 # Output: one "word total" line per distinct word, in unspecified order.
 reduce() {
  awk '
    NF > 0 { total[$1] += $2 }
    END {
      for (word in total) {
        print word, total[word]
      }
    }
  '
 }
 export -f reduce

 # read arguments, define parameters
 input_file=${1:-}
 n_workers=${2:-}

 if [[ -z "$input_file" || -z "$n_workers" ]]; then
  printf 'Usage: %s file n_workers\n' "${0##*/}" >&2
  exit 2
 fi
 if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
  printf '%s: cannot read file: %s\n' "${0##*/}" "$input_file" >&2
  exit 1
 fi
 # n_workers is a divisor below and the xargs -P value: it must be a
 # positive integer (10# keeps values such as "08" from being read as octal).
 if [[ ! "$n_workers" =~ ^[0-9]+$ ]] || (( 10#$n_workers == 0 )); then
  printf '%s: n_workers must be a positive integer: %s\n' \
    "${0##*/}" "$n_workers" >&2
  exit 2
 fi
 n_workers=$(( 10#$n_workers ))

 # Count records with awk instead of `wc -l`: wc counts newline characters,
 # so a last line without a trailing newline would never be assigned to a
 # segment. Reading via redirection keeps the path from being parsed by awk.
 n_lines=$(awk 'END { print NR }' < "$input_file") || exit 1

 # An empty file has no words; stop before building a "1..0" segment.
 if (( n_lines == 0 )); then
  exit 0
 fi

 # create arguments for map function (to parallelize it)
 segment_size=$(( n_lines / n_workers ))
 remainder=$(( n_lines % n_workers ))
 end=0
 map_args=()

 # fill array with (file, segment_left, segment_right); the first
 # `remainder` segments take one extra line so every line is covered once
 for (( i = 1; i <= n_workers; i++ )); do
  addition=$(( i <= remainder ))

  start=$(( end + 1 ))
  end=$(( end + segment_size + addition ))

  # more workers than lines: do not spawn a worker for an empty range
  (( start <= end )) || continue
  map_args+=("$input_file" "$start" "$end")
 done

 # parallelize map and reduce, then use single reduce with sorting.
 # Arguments travel NUL-delimited so a path containing whitespace or quotes
 # still reaches map as a single argument.
 printf '%s\0' "${map_args[@]}" \
 | xargs -0 -P "$n_workers" -n 3 bash -c 'map "$@" | reduce' _ \
 | reduce \
 | sort -k2nr -k1
	#!/bin/bash
	# Word count of file using map reduce
	# Args:
	# - file
	# - n_workers

	# Map phase: count the words found in one line range of a file.
	# Args:
	#   $1 - path of the file to read
	#   $2 - first line of the segment (1-based, inclusive)
	#   $3 - last line of the segment (inclusive)
	# Outputs:
	#   one "word count" line per distinct word of the segment, on stdout
	map() {
	  local file=$1
	  local from=$2
	  local to=$3

	  # An empty segment (from > to, e.g. more workers than lines) has nothing
	  # to count; returning early also keeps an invalid "0q" address away from sed.
	  (( from <= to )) || return 0

	  # The stages must be joined by real (unescaped) pipes: an escaped pipe is
	  # handed to sed as a literal file operand and the pipeline never runs.
	  # sed prints only the requested lines and quits right after the last one.
	  # Punctuation and every kind of whitespace (tabs included) separate words;
	  # -s squeezes runs of separators into a single newline.
	  # The final awk keeps only well-formed "count word" pairs: a segment that
	  # starts with a separator yields one empty token, which must not be emitted.
	  sed -n -- "${from},${to}p; ${to}q" "$file" \
	  | tr -s '[:punct:][:space:]' '\n' \
	  | sort \
	  | uniq -c \
	  | awk 'NF == 2 { print $2" "$1 }'
	}
	export -f map

	# Reduce phase: merge partial counts read from stdin.
	# Input:  lines of the form "word count"; blank lines are ignored.
	# Output: one "word total" line per distinct word, in unspecified order.
	reduce() {
	  awk '
	    { if (NF) sum[$1] += $2 }
	    END {
	      for (key in sum)
	        print key, sum[key]
	    }
	  '
	}
	export -f reduce

	# read arguments, define parameters
	input_file=${1:-}
	n_workers=${2:-}

	if [[ -z "$input_file" || -z "$n_workers" ]]; then
	  printf 'Usage: %s file n_workers\n' "${0##*/}" >&2
	  exit 2
	fi
	if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
	  printf '%s: cannot read file: %s\n' "${0##*/}" "$input_file" >&2
	  exit 1
	fi
	# n_workers is a divisor below and the xargs -P value: it must be a
	# positive integer (10# keeps values such as "08" from being read as octal).
	if [[ ! "$n_workers" =~ ^[0-9]+$ ]] || (( 10#$n_workers == 0 )); then
	  printf '%s: n_workers must be a positive integer: %s\n' \
	    "${0##*/}" "$n_workers" >&2
	  exit 2
	fi
	n_workers=$(( 10#$n_workers ))

	# Count records with awk instead of `wc -l`: wc counts newline characters,
	# so a last line without a trailing newline would never be assigned to a
	# segment. Reading via redirection keeps the path from being parsed by awk.
	n_lines=$(awk 'END { print NR }' < "$input_file") || exit 1

	# An empty file has no words; stop before building a "1..0" segment.
	if (( n_lines == 0 )); then
	  exit 0
	fi

	# create arguments for map function (to parallelize it)
	segment_size=$(( n_lines / n_workers ))
	remainder=$(( n_lines % n_workers ))
	end=0
	map_args=()

	# fill array with (file, segment_left, segment_right); the first
	# `remainder` segments take one extra line so every line is covered once
	for (( i = 1; i <= n_workers; i++ )); do
	  addition=$(( i <= remainder ))

	  start=$(( end + 1 ))
	  end=$(( end + segment_size + addition ))

	  # more workers than lines: do not spawn a worker for an empty range
	  (( start <= end )) || continue
	  map_args+=("$input_file" "$start" "$end")
	done

	# parallelize map and reduce, then use single reduce with sorting.
	# The stages are joined by real (unescaped) pipes, and the arguments travel
	# NUL-delimited so a path containing whitespace or quotes still reaches map
	# as a single argument.
	printf '%s\0' "${map_args[@]}" \
	| xargs -0 -P "$n_workers" -n 3 bash -c 'map "$@" | reduce' _ \
	| reduce \
	| sort -k2nr -k1