Last active
February 19, 2024 04:08
-
-
Save groboclown/f3be4f70de82f6bee3a61553b61d7ff1 to your computer and use it in GitHub Desktop.
Teach A Shell Script How To Parse Unicode Files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Licensed as CC-0
#
# An example script for teaching the shell how to parse and work with
# Unicode files.  It scans the input file and prints every line that
# contains a specific Unicode character.
#
# The character must be given as a UTF-32BE (big endian) hex string.
# For example, to search for 'A', use '00000041'.  Upper- and lower-case
# hex digits are both accepted.
#
# Required tools:
#   - common core unix tools: 'tr', 'printf' and 'test' (aliased as '[')
#   - 'file' with '-b' and '--mime-encoding' support
#   - 'iconv' to perform the encoding conversion
#   - 'xxd' to perform raw-to-hex and hex-to-raw conversion
#   - bash: the 'read -n' / 'read -d' options used below are not POSIX,
#     so a plain 'sh' such as dash cannot run this script.
#
# Usage:
#   lines-with-unicode-character.sh FILENAME CHARACTER
# where:
#   FILENAME  is the file to scan.
#   CHARACTER is the 8-hex-digit, UTF-32BE encoded character to search for.
#
# Exit status: 0 on success, 2 on bad arguments, non-zero if the file
# cannot be read or a conversion step fails.
#
# Example:
#   $ curl -o /tmp/UTF-8-demo.txt https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
#   $ ./lines-with-unicode-character.sh /tmp/UTF-8-demo.txt 00000041

# Make the conversion pipeline in main() report a failure of any stage,
# not only of the final 'iconv'.
set -o pipefail

# Print a short usage message to stderr.
usage() {
  printf 'Usage: %s FILENAME CHARACTER\n' "${0##*/}" >&2
  printf '  CHARACTER is 8 hex digits, UTF-32BE (e.g. 00000041 for "A").\n' >&2
}

# Validate and normalize a UTF-32BE hex character.
# Arguments: $1 - candidate hex string
# Outputs:   the lower-cased 8-digit hex string on stdout
# Returns:   0 if $1 is exactly 8 hex digits, 1 otherwise
normalize_codepoint() {
  local hex
  # xxd emits lower-case hex, so fold the user's input to match it.
  hex="$(printf '%s' "$1" | LC_ALL=C tr 'A-F' 'a-f')"
  case "${hex}" in
    *[!0-9a-f]*) return 1 ;;
  esac
  [ "${#hex}" -eq 8 ] || return 1
  printf '%s\n' "${hex}"
}

# Filter a stream of UTF-32BE characters expressed as hex digits.
# Arguments: $1 - the 8-hex-digit (lower-case) character to look for
# Input:     stdin - contiguous hex digits, 8 per character, no whitespace
# Outputs:   stdout - the hex digits of every line (including its EOL, if
#            present) that contains the character; nothing else is written.
filter_hex_lines() {
  local search_for="$1"
  local eol="0000000a"   # U+000A LINE FEED in UTF-32BE
  local buff=""          # characters of the current line seen before a match
  local in_line=0        # 1 once the current line is known to match
  local char

  # Read 8 hex digits (one UTF-32BE character) at a time until the stream
  # is exhausted.  A trailing chunk shorter than 8 digits makes 'read'
  # fail and is dropped.
  while IFS='' LC_ALL=C read -r -d '' -n 8 char; do
    if [ "${in_line}" = 1 ]; then
      # The line already matched: pass characters straight through.  This
      # also emits the EOL itself and copes with a missing final EOL.
      printf '%s' "${char}"
    else
      buff="${buff}${char}"
      if [ "${char}" = "${search_for}" ]; then
        # First match on this line: flush everything buffered so far.
        in_line=1
        printf '%s' "${buff}"
      fi
    fi
    # On EOL, reset the buffer and the found marker for the next line.
    if [ "${char}" = "${eol}" ]; then
      in_line=0
      buff=""
    fi
  done
}

# Entry point.
# Arguments: $1 - FILENAME, $2 - CHARACTER (see the header comment)
# Returns:   0 on success, 2 on usage errors, non-zero on I/O or
#            conversion errors.
main() {
  if [ "$#" -lt 2 ]; then
    usage
    return 2
  fi

  local filename="$1"
  local search_for encoding

  if ! search_for="$(normalize_codepoint "$2")"; then
    printf 'error: CHARACTER must be exactly 8 hex digits: %s\n' "$2" >&2
    return 2
  fi

  # 'file' exits 0 even when it cannot open the file, so check up front.
  if [ ! -r "${filename}" ]; then
    printf 'error: cannot read file: %s\n' "${filename}" >&2
    return 1
  fi
  # An empty file has no lines to report (and 'file' calls it "binary").
  [ -s "${filename}" ] || return 0

  # Discover the source encoding, which is also used for the output.
  # '-b' (brief) suppresses the 'FILENAME: ' prefix, so filenames that
  # contain ':' or spaces need no special parsing.
  if ! encoding="$(file -b --mime-encoding -- "${filename}")"; then
    printf 'error: could not detect the encoding of: %s\n' "${filename}" >&2
    return 1
  fi
  case "${encoding}" in
    '' | binary | unknown*)
      printf 'error: unsupported encoding "%s" for: %s\n' \
        "${encoding}" "${filename}" >&2
      return 1
      ;;
  esac

  # Pipeline:
  #   1. 'iconv' converts the file from the detected encoding to UTF-32BE.
  #   2. 'xxd -p' dumps the raw bytes as plain hex; the output is byte by
  #      byte, so endian-ness is not an issue.
  #   3. 'tr' strips the line breaks xxd inserts.
  #   4. filter_hex_lines keeps only the matching lines.
  #   5. 'xxd -r -p' turns the hex back into raw UTF-32BE bytes.
  #   6. 'iconv' converts back to the original encoding.
  # An extra
  #     sed 's|........|& |g'
  # after the 'tr' would split the stream into space separated 8-digit
  # words for shell word splitting, but that is not needed here.
  iconv -f "${encoding}" -t UTF-32BE < "${filename}" \
    | xxd -p \
    | tr -d '\r\n ' \
    | filter_hex_lines "${search_for}" \
    | xxd -r -p \
    | iconv -f UTF-32BE -t "${encoding}"
}

# No explicit 'exit': the script's status is main's return value.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment