Last active
November 3, 2022 02:59
-
-
Save kristopolous/4b0627e0962f2216c31161796304833a to your computer and use it in GitHub Desktop.
top-10 URIs in the apache log in pure bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Print the most-requested URIs of an Apache access log using only bash
# builtins (no cat, sort, uniq, awk, ...).
#
# Usage: bash <this-script> [logfile] [how_many]
#   logfile   log to read           (default: access.log in the current dir)
#   how_many  number of URIs shown  (default: 10)
#
# Output: one "<hits><TAB><uri>" line per URI, most-requested first. URIs with
# equal hit counts are listed in the order they first appear in the log.
#
# The original one-liner this script was expanded from, kept verbatim (it does
# not include the fixes made in the formatted version below):
#set -o noglob;declare -a mapper=() count=();exec 3<access.log; while read -a lines -u 3; do [[ ${lines[5]} != '"GET' ]] && continue; page="${lines[6]}"; ix=0;for i in "${mapper[@]}"; do [[ "$page" = "$i" ]] && break; (( ix++ )); done ; if [[ "$page" = "$i" ]]; then (( count[$ix] ++ )); else mapper+=( $page ); count[$ix]=1; fi; done; (( max=-1, winner=-1, show=10 )); while (( show > 0 )); do ix=0;for i in "${count[@]}"; do (( i > max )) && (( max=i, winner=ix )) ;(( ix++ ));done; builtin echo -e "${count[winner]}\t${mapper[$winner]}";(( count[$winner]=-1, max=0, show-- )); done;

# Log fields may contain glob characters (*, ?, [). Every expansion below is
# quoted; globbing is switched off as a second line of defence.
set -o noglob

#######################################
# Count GET requests per URI and print the most frequent ones.
# Arguments:
#   $1 - path of the access log (default: access.log)
#   $2 - maximum number of URIs to print (default: 10)
# Outputs:
#   "<hits><TAB><uri>" lines on stdout, highest hit count first.
#   Diagnostics on stderr.
# Returns:
#   0 on success, 1 if the log cannot be read, 2 if $2 is not a
#   non-negative integer.
#######################################
top_uris() {
  local logfile=${1:-access.log} limit=${2:-10}
  # uris[k] and hits[k] are parallel: the k-th distinct URI and its hit count.
  local -a fields=() uris=() hits=()
  local uri ix found best best_ix remaining

  if [[ ! $limit =~ ^[0-9]+$ ]]; then
    printf 'top_uris: how_many must be a non-negative integer, got: %s\n' \
      "$limit" >&2
    return 2
  fi
  # Force base 10 so an argument such as "08" is not parsed as octal.
  limit=$(( 10#$limit ))

  if [[ ! -r $logfile || -d $logfile ]]; then
    printf 'top_uris: cannot read log file: %s\n' "$logfile" >&2
    return 1
  fi

  # Split each line on whitespace. The code relies on field 5 being the
  # opening-quoted request method and field 6 the URI, i.e. lines shaped like
  #   host ident user [date zone] "GET /uri HTTP/1.1" status bytes
  # -r keeps backslashes in the URI intact; the "|| (( ... ))" clause still
  # processes a final line that has no trailing newline.
  while IFS=$' \t\n' read -r -a fields || (( ${#fields[@]} > 0 )); do
    # We only look at GETs.
    [[ ${fields[5]-} == '"GET' ]] || continue
    uri=${fields[6]-}
    # A truncated request line has no URI; skip it rather than count "".
    [[ -n $uri ]] || continue

    # Linear search through the URIs seen so far. This keeps the script
    # usable on bash versions without associative arrays.
    found=0
    for (( ix = 0; ix < ${#uris[@]}; ix++ )); do
      if [[ ${uris[ix]} == "$uri" ]]; then
        found=1
        break
      fi
    done

    if (( found )); then
      (( hits[ix]++ ))
    else
      # ix equals the current array length here, so both arrays grow in step.
      uris+=("$uri")
      hits[ix]=1
    fi
  done <"$logfile"

  # Selection pass: pick the current maximum, print it, retire it, repeat.
  # Never run more passes than there are distinct URIs, so nothing is printed
  # twice and an empty tally prints nothing.
  remaining=$(( limit < ${#hits[@]} ? limit : ${#hits[@]} ))
  for (( ; remaining > 0; remaining-- )); do
    best=-1
    best_ix=-1
    for (( ix = 0; ix < ${#hits[@]}; ix++ )); do
      # Strict ">" keeps the earliest-seen URI first among equal counts.
      if (( hits[ix] > best )); then
        best=${hits[ix]}
        best_ix=$ix
      fi
    done
    # printf with %s emits the URI byte-for-byte (no escape interpretation).
    printf '%s\t%s\n' "${hits[best_ix]}" "${uris[best_ix]}"
    # Retire the winner so the next pass cannot select it again.
    hits[best_ix]=-1
  done
}

# As the last command, its return status becomes the script's exit status.
top_uris "${1:-access.log}" "${2:-10}"
I was trying to think of an elegant speedup with minimal code. There are trivial things you can do to avoid the multipass sort (keep track of the top 10 and not do 10 passes), but yawn, unimpressed.
You can do weighted swapping during the insertion phase as well, another big yawn.
They both suck
I'm looking for something really innovative. For instance, the `set` builtin shows things in alphabetical order; if you could cleverly encode your values to leverage that, then you get the sort for free.
Lol, I know how. The problem is you'll want grep.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Backstory: My friend sent me this
I thought "python!? I don't even need sort. Hell, I don't need cat"