groboclown · February 19, 2024 04:08
diff --git a/lines-with-unicode-character.sh b/lines-with-unicode-character.sh
 #!/usr/bin/env sh
 # Licensed as CC-0

 # An example script that teaches how to parse and work with Unicode files in a shell script.
 # In this example, the code scans the input file for lines that contain a specific Unicode character.
 # The character must be represented as a UTF-32be (big endian) hex string.
 # For example, to search for 'A', use '00000041'

 # This script uses the tools:
 #   common core unix tools: 'tr', 'echo', 'cut' and 'test' (aliased as '[')
 #   'file' with the '--mime-encoding' argument support.
 #   'iconv' to perform the encoding conversion.
 #   'xxd' to perform raw-to-hex and hex-to-raw conversion.

 # You would invoke this script as:
 #  lines-with-unicode-character.sh FILENAME CHARACTER
 # where:
 #  FILENAME is the filename to scan.
 #  CHARACTER is the 8-hex digit, UTF-32 BE encoded character to search for.

 # As an example script, it doesn't have solid error handling or help output.

 # Example usage:
 #   $ curl -o /tmp/UTF-8-demo.txt https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
 #   $ ./lines-with-unicode-character.sh /tmp/UTF-8-demo.txt 00000041

 # Input
 #   $1 - the file to scan.
 #   $2 - the character to search for, as 8 hex digits (UTF-32 BE).
 filename="$1"
 # 'xxd' writes lower-case hex digits, so fold the search character to lower
 # case; '0000004A' and '0000004a' then both match.
 search_for="$( printf '%s' "$2" | tr 'ABCDEF' 'abcdef' )"
 # Discover source encoding, which will also be used for output encoding.
 #   - '-b' (brief) makes 'file' print only the encoding, without the leading
 #     'FILENAME: ', so nothing needs to be cut away and file names that
 #     contain a ':' work too.
 #   - '--' stops option parsing, so a file name starting with '-' is safe.
 #   - With no pipeline inside the substitution, '|| exit 1' tests the exit
 #     status of 'file' itself.
 encoding="$( file -b --mime-encoding -- "${filename}" )" || exit 1
 # Constants
 #   The line feed character (U+000A) as UTF-32 BE hex.
 eol="0000000a"

 # Local variables
 #   buff    - hex digits of the current line collected so far.
 #   in_line - 1 once the current line is known to contain the character.
 buff=""
 in_line=0

 # Read the file by:
 #   1. convert the file with 'iconv' from the encoding discovered by 'file' (above)
 #      to UTF-32 BE, so that every character is exactly 4 octets.
 #   2. convert the raw binary into hexadecimal with 'xxd'.  Plain mode ('-ps')
 #      with 4 octets per line ('-c 4') prints one character per line as 8 hex
 #      digits, in stream order, so endian-ness is not a concern.  Plain mode
 #      applies neither grouping nor colour, so '-g' and '-R' are not passed
 #      ('-R' is also unknown to older xxd releases).
 #   3. read one line (1 UTF-32 BE character) at a time, until the stream is empty.
 # Splitting on lines keeps the loop within what 'read -r' offers in any 'sh';
 # the '-d' and '-n' options of 'read' are extensions that a plain 'sh' may lack.
 #
 # Re-join the output by:
 #   1. strip newlines and spaces with 'tr'.
 #   2. convert the hex stream into binary with 'xxd'.
 #   3. convert the raw utf-32be into the encoding.

 # select_matching_lines SEARCH_FOR EOL
 #   Filter a stream of characters down to the lines that contain SEARCH_FOR.
 #   Arguments:
 #     $1 - the character to search for, as 8 lower-case hex digits.
 #     $2 - the end-of-line character, as 8 lower-case hex digits.
 #   Input:   stdin, one 8-hex digit character per line.
 #   Output:  stdout, hex digits of the matching lines (EOL included when present).
 select_matching_lines() {
     # Characters of the current line seen before a match was found.
     buff=""
     # Set to 1 once the current line is known to contain the character.
     in_line=0
     while IFS= read -r char ; do
         if [ "${in_line}" = 1 ] ; then
             # The line already matched: pass the character straight through.
             # This covers the EOL itself and a last line with no trailing EOL.
             printf '%s\n' "${char}"
         else
             # Hold the character back until the line is known to match.
             buff="${buff}${char}"
             if [ "${char}" = "$1" ] ; then
                 in_line=1
                 printf '%s\n' "${buff}"
             fi
         fi

         # On EOL, reset the buffer + found marker for the next line.
         if [ "${char}" = "$2" ] ; then
             in_line=0
             buff=""
         fi
     done
 }

 iconv -t utf32be -f "${encoding}" < "${filename}" | \
 xxd -ps -c 4 | \
 select_matching_lines "${search_for}" "${eol}" | \
 tr -d '\r\n ' | \
 xxd -r -ps | \
 iconv -t "${encoding}" -f utf32be
	#!/usr/bin/env sh
	# Licensed as CC-0

	# An example script that teaches how to parse and work with Unicode files in a shell script.
	# In this example, the code scans the input file for lines that contain a specific Unicode character.
	# The character must be represented as a UTF-32be (big endian) hex string.
	# For example, to search for 'A', use '00000041'

	# This script uses the tools:
	# common core unix tools: 'tr', 'echo', 'cut' and 'test' (aliased as '[')
	# 'file' with the '--mime-encoding' argument support.
	# 'iconv' to perform the encoding conversion.
	# 'xxd' to perform raw-to-hex and hex-to-raw conversion.

	# You would invoke this script as:
	# lines-with-unicode-character.sh FILENAME CHARACTER
	# where:
	# FILENAME is the filename to scan.
	# CHARACTER is the 8-hex digit, UTF-32 BE encoded character to search for.

	# As an example script, it doesn't have solid error handling or help output.

	# Example usage:
	# $ curl -o /tmp/UTF-8-demo.txt https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
	# $ ./lines-with-unicode-character.sh /tmp/UTF-8-demo.txt 00000041

	# Input
	#   $1 - the file to scan.
	#   $2 - the character to search for, as 8 hex digits (UTF-32 BE).
	filename="$1"
	# 'xxd' writes lower-case hex digits, so fold the search character to lower
	# case; '0000004A' and '0000004a' then both match.
	search_for="$( printf '%s' "$2" | tr 'ABCDEF' 'abcdef' )"
	# Discover source encoding, which will also be used for output encoding.
	#   - '-b' (brief) makes 'file' print only the encoding, without the leading
	#     'FILENAME: ', so nothing needs to be cut away and file names that
	#     contain a ':' work too.
	#   - '--' stops option parsing, so a file name starting with '-' is safe.
	#   - With no pipeline inside the substitution, '|| exit 1' tests the exit
	#     status of 'file' itself.
	encoding="$( file -b --mime-encoding -- "${filename}" )" || exit 1
	# Constants
	#   The line feed character (U+000A) as UTF-32 BE hex.
	eol="0000000a"

	# Local variables
	#   buff    - hex digits of the current line collected so far.
	#   in_line - 1 once the current line is known to contain the character.
	buff=""
	in_line=0

	# Read the file by:
	#   1. convert the file with 'iconv' from the encoding discovered by 'file' (above)
	#      to UTF-32 BE, so that every character is exactly 4 octets.
	#   2. convert the raw binary into hexadecimal with 'xxd'.  Plain mode ('-ps')
	#      with 4 octets per line ('-c 4') prints one character per line as 8 hex
	#      digits, in stream order, so endian-ness is not a concern.  Plain mode
	#      applies neither grouping nor colour, so '-g' and '-R' are not passed
	#      ('-R' is also unknown to older xxd releases).
	#   3. read one line (1 UTF-32 BE character) at a time, until the stream is empty.
	# Splitting on lines keeps the loop within what 'read -r' offers in any 'sh';
	# the '-d' and '-n' options of 'read' are extensions that a plain 'sh' may lack.
	#
	# Re-join the output by:
	#   1. strip newlines and spaces with 'tr'.
	#   2. convert the hex stream into binary with 'xxd'.
	#   3. convert the raw utf-32be into the encoding.

	# select_matching_lines SEARCH_FOR EOL
	#   Filter a stream of characters down to the lines that contain SEARCH_FOR.
	#   Arguments:
	#     $1 - the character to search for, as 8 lower-case hex digits.
	#     $2 - the end-of-line character, as 8 lower-case hex digits.
	#   Input:   stdin, one 8-hex digit character per line.
	#   Output:  stdout, hex digits of the matching lines (EOL included when present).
	select_matching_lines() {
		# Characters of the current line seen before a match was found.
		buff=""
		# Set to 1 once the current line is known to contain the character.
		in_line=0
		while IFS= read -r char ; do
			if [ "${in_line}" = 1 ] ; then
				# The line already matched: pass the character straight through.
				# This covers the EOL itself and a last line with no trailing EOL.
				printf '%s\n' "${char}"
			else
				# Hold the character back until the line is known to match.
				buff="${buff}${char}"
				if [ "${char}" = "$1" ] ; then
					in_line=1
					printf '%s\n' "${buff}"
				fi
			fi

			# On EOL, reset the buffer + found marker for the next line.
			if [ "${char}" = "$2" ] ; then
				in_line=0
				buff=""
			fi
		done
	}

	iconv -t utf32be -f "${encoding}" < "${filename}" | \
	xxd -ps -c 4 | \
	select_matching_lines "${search_for}" "${eol}" | \
	tr -d '\r\n ' | \
	xxd -r -ps | \
	iconv -t "${encoding}" -f utf32be