Last active
December 17, 2015 19:29
-
-
Save fireblue/5660870 to your computer and use it in GitHub Desktop.
Detect whether the content of an NSData object is UTF-8 encoded.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Reports whether the receiver's bytes are well-formed UTF-8 text.
 *
 * Exactly `self.length` bytes are examined; the buffer is never assumed to be
 * NUL-terminated. Each sequence must be a shortest-form encoding, must not
 * encode a UTF-16 surrogate (U+D800..U+DFFF), and must not exceed U+10FFFF.
 *
 * Single-byte characters are restricted to TAB, LF, CR and printable ASCII
 * (0x20..0x7E). Every other ASCII control byte, including NUL and DEL, makes
 * the method return NO. To accept all ASCII control characters, replace the
 * single-byte test with `lead <= 0x7F`.
 *
 * @return YES if every byte belongs to an accepted sequence (empty data is
 *         accepted), NO otherwise, including when the data ends in the middle
 *         of a multi-byte sequence.
 */
- (BOOL)isUTF8
{
    const unsigned char *bytes = (const unsigned char *)self.bytes;
    NSUInteger remaining = self.length;

    // Bounding the loop by the length (not by a terminator byte) means the
    // pointer is never dereferenced for empty data and never read past the end.
    while (remaining > 0)
    {
        const unsigned char lead = bytes[0];
        NSUInteger width;

        // Allowed range of the first continuation byte. It is narrower than
        // 0x80..0xBF for the lead bytes that could otherwise produce overlong
        // forms, surrogates, or code points above U+10FFFF.
        unsigned char secondMin = 0x80;
        unsigned char secondMax = 0xBF;

        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            (0x20 <= lead && lead <= 0x7E)) {
            // ASCII text
            width = 1;
        } else if (0xC2 <= lead && lead <= 0xDF) {
            // non-overlong 2-byte (0xC0 and 0xC1 would be overlong)
            width = 2;
        } else if (0xE0 <= lead && lead <= 0xEF) {
            width = 3;
            if (lead == 0xE0) {
                secondMin = 0xA0;   // excluding overlongs
            } else if (lead == 0xED) {
                secondMax = 0x9F;   // excluding surrogates
            }
        } else if (0xF0 <= lead && lead <= 0xF4) {
            width = 4;
            if (lead == 0xF0) {
                secondMin = 0x90;   // planes 1-3, excluding overlongs
            } else if (lead == 0xF4) {
                secondMax = 0x8F;   // plane 16, nothing above U+10FFFF
            }
        } else {
            // Disallowed control byte, stray continuation byte, or a lead byte
            // that can never appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF).
            return NO;
        }

        // The sequence must fit entirely inside the data.
        if (remaining < width) {
            return NO;
        }

        for (NSUInteger i = 1; i < width; i++) {
            const unsigned char low  = (i == 1) ? secondMin : 0x80;
            const unsigned char high = (i == 1) ? secondMax : 0xBF;
            if (bytes[i] < low || bytes[i] > high) {
                return NO;
            }
        }

        bytes += width;
        remaining -= width;
    }

    return YES;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment