Created
November 10, 2014 13:33
-
-
Save gabriel-laddel/2205ae3e1c54abfee633 to your computer and use it in GitHub Desktop.
this code looked useful - from jaderholem's dotfiles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defun char-utf-8-byte-length (char) | |
(let ((code (char-code char))) | |
(cond ((< code 128) 1) | |
((< code 2048) 2) | |
((< code 65536) 3) | |
(t 4)))) | |
(defmacro as-utf-8-bytes (char writer) | |
"Given a character, calls the writer function for every byte in the | |
encoded form of that character." | |
(let ((char-code (gensym))) | |
`(let ((,char-code (char-code ,char))) | |
(declare (type fixnum ,char-code)) | |
(cond ((< ,char-code 128) | |
(,writer ,char-code)) | |
((< ,char-code 2048) | |
(,writer (logior #b11000000 (ldb (byte 5 6) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code)))) | |
((< ,char-code 65536) | |
(,writer (logior #b11100000 (ldb (byte 4 12) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 6) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code)))) | |
(t | |
(,writer (logior #b11110000 (ldb (byte 3 18) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 12) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 6) ,char-code))) | |
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code)))))))) | |
(defun url-encode (string &optional (to-escape "\"#$%&+,/:;<=>?@")) | |
(declare (optimize speed (safety 0))) | |
(let ((size (loop :for ch :across string :for code := (char-code ch) :sum | |
(cond ((> code 127) (* (char-utf-8-byte-length ch) 3)) | |
((or (< code 33) (find ch to-escape)) 3) | |
(t 1))))) | |
(if (= size (length string)) | |
string | |
(let ((out (make-string size)) (pos 0)) | |
(macrolet ((wr (ch) `(progn (setf (schar out pos) ,ch) (incf pos)))) | |
(flet ((code-out (ch) | |
(multiple-value-bind (hi lo) (floor (char-code ch) 16) | |
(wr #\%) (wr (digit-char hi 16)) (wr (digit-char lo 16))))) | |
(loop :for ch :across string :for code := (char-code ch) :do | |
(cond ((> code 127) (as-utf-8-bytes ch code-out)) | |
((or (< code 33) (find ch to-escape)) (code-out ch)) | |
(t (wr ch)))))) | |
out)))) | |
(defun utf-8-group-size (byte) | |
"Determine the amount of bytes that are part of the character | |
starting with a given byte." | |
(declare (type fixnum byte)) | |
(cond ((zerop (logand byte #b10000000)) 1) | |
((= (logand byte #b11100000) #b11000000) 2) | |
((= (logand byte #b11110000) #b11100000) 3) | |
((= (logand byte #b11111000) #b11110000) 4) | |
(t (error "Invalid UTF-8 byte: 0x~X" byte)))) | |
(defun get-utf-8-character (bytes group-size &aux (start 0)) | |
"Given an array of bytes and the amount of bytes to use, | |
extract the character they denote." | |
(declare (type (simple-array (unsigned-byte 8) (*)) bytes) | |
(type fixnum group-size)) | |
(macrolet ((next-byte () | |
'(prog1 (elt bytes start) | |
(incf start))) | |
(six-bits (byte) | |
(let ((b (gensym))) | |
`(let ((,b ,byte)) | |
(unless (= (logand ,b #b11000000) #b10000000) | |
(error "Invalid byte 0x~X inside a character." ,b)) | |
(ldb (byte 6 0) ,b)))) | |
(test-overlong (byte min-size) | |
(let ((b (gensym))) | |
`(let ((,b ,byte)) | |
(unless (> ,b ,min-size) | |
(error "Overlong UTF-8 byte sequence found.")) | |
,b)))) | |
(ecase group-size | |
(1 (next-byte)) | |
(2 (test-overlong (logior (ash (ldb (byte 5 0) (next-byte)) 6) | |
(six-bits (next-byte))) 128)) | |
(3 (test-overlong (logior (ash (ldb (byte 4 0) (next-byte)) 12) | |
(ash (six-bits (next-byte)) 6) | |
(six-bits (next-byte))) 2048)) | |
(4 (test-overlong (logior (ash (ldb (byte 3 0) (next-byte)) 18) | |
(ash (six-bits (next-byte)) 12) | |
(ash (six-bits (next-byte)) 6) | |
(six-bits (next-byte))) 65536))))) | |
(defun url-decode (string) | |
(declare (optimize speed (safety 0))) | |
(let ((buf (make-string (length string))) | |
(utf-buf (make-array 4 :element-type '(unsigned-byte 8)))) | |
(with-input-from-string (in string) | |
(loop :for pos :from 0 :for ch := (read-char in nil nil) :while ch :do | |
(macrolet ((hex () | |
'(let ((big (digit-char-p (read-char in nil #\x) 16)) | |
(small (digit-char-p (read-char in nil #\x) 16))) | |
(unless (and big small) (error "Junk in URL.")) | |
(+ small (ash big 4))))) | |
(setf (schar buf pos) | |
(case ch | |
(#\+ #\space) | |
(#\% (let* ((code (hex)) | |
(group (utf-8-group-size code))) | |
(setf (aref utf-buf 0) code) | |
(loop :for i :from 1 :below group :do | |
(unless (eql (read-char in nil nil) #\%) | |
(error "Nonsense UTF-8 code in URL.")) | |
(setf (aref utf-buf i) (hex))) | |
(get-utf-8-character utf-buf group))) | |
(t ch)))) | |
:finally (return (if (eql pos (length buf)) buf (subseq buf 0 pos))))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment