Last active
February 26, 2018 06:57
-
-
Save junlincao/74678905a871af2c1ae1982128974595 to your computer and use it in GitHub Desktop.
转换utf-8 3个字节无法表示的字符
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.io.Reader; | |
import java.io.StringReader; | |
import java.io.Writer; | |
/** | |
* 转换utf-8 3个字节无法表示的字符 | |
* | |
* @author CJL | |
* @since 2018-02-26 | |
*/ | |
public class EmojiConverter { | |
private static final char MARKER = '\u0007'; // \u0000 | |
private static final int RADIX = 32; // 保存为字符进制数(建议16或32) | |
private static final int BUF_LEN = 1024; | |
public static String encode(String str) throws IOException { | |
EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length()); | |
encode(new StringReader(str), sw); | |
return sw.toString(); | |
} | |
public static String decode(String str) throws IOException { | |
EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length()); | |
decode(new StringReader(str), sw); | |
return sw.toString(); | |
} | |
public static String filter(String str) throws IOException { | |
EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length()); | |
filter(new StringReader(str), sw); | |
return sw.toString(); | |
} | |
public static void encode(Reader reader, Writer writer) throws IOException { | |
char[] buf = new char[BUF_LEN]; | |
char[] tmp = new char[8]; // 用于存储Integer转换后的字符 | |
int pos = 0; | |
OUTER: | |
do { | |
int readLen = reader.read(buf, pos, buf.length - pos); | |
if (pos == 0 && readLen == -1) { // 没有字符了,退出 | |
break; | |
} | |
int endPos = pos + (readLen == -1 ? 0 : readLen); // buf中有效字符结束位置(不包括) | |
pos = 0; | |
while (true) { // 遍历buf中的字符 | |
if (pos >= endPos) { // 已经遍历完了,则退出循环 | |
pos = 0; | |
break; | |
} | |
char c = buf[pos]; | |
if (!Character.isSurrogate(c)) { // 非特殊字符,则直接写入 | |
writer.write(c); | |
pos++; | |
continue; | |
} | |
if (pos == endPos - 1) { // buf中的字符已经到了末尾,不足以组成SurrogatePair,则重新读取缓冲 | |
if (readLen == -1) { // 已经读完了字符串,最后一个字符只能直接写入了 | |
writer.write(c); | |
break OUTER; | |
} else { // 将末尾未读取到的字符放到buf头部去,方便下次填充buf | |
buf[0] = buf[pos]; | |
pos = 1; | |
break; | |
} | |
} | |
char cNext = buf[pos + 1]; | |
if (Character.isSurrogatePair(c, cNext)) { | |
int code = Character.toCodePoint(c, cNext); | |
writer.write(MARKER); | |
int hexLen = EmojiConverterHelper.writeIntChars(code, tmp, RADIX); | |
writer.write(tmp, 0, hexLen); | |
writer.write(MARKER); | |
pos += 2; | |
} else { // 单独的一个Surrogate字符,应该是异常字符! | |
writer.write(c); | |
pos++; | |
} | |
} | |
} while (true); | |
} | |
public static void decode(Reader reader, Writer writer) throws IOException { | |
char[] buf = new char[BUF_LEN]; | |
int pos = 0; | |
do { | |
int readLen = reader.read(buf, pos, buf.length - pos); | |
if (pos == 0 && readLen == -1) { // 没有字符了,退出 | |
break; | |
} | |
int endPos = pos + (readLen == -1 ? 0 : readLen); // buf中有效字符结束位置(不包括) | |
pos = 0; | |
while (true) { // 遍历buf中的字符 | |
if (pos >= endPos) { // 已经遍历完了,则退出循环 | |
pos = 0; | |
break; | |
} | |
char c = buf[pos]; | |
if (c != MARKER) { | |
writer.write(c); | |
pos++; | |
continue; | |
} | |
int maxNextMarkerPos = Math.min(endPos, pos + 10); | |
int nextMarkerPos = nextMarkerPos(buf, pos + 1, maxNextMarkerPos); | |
if (nextMarkerPos == -1 || nextMarkerPos - pos > 9) { | |
// 没找到匹配的结束marker | |
if (readLen == -1 || pos == 0 || maxNextMarkerPos < endPos) { | |
writer.write(c); | |
pos++; | |
continue; | |
} | |
// 这种没找到可能是因为读到了buf末尾,需要继续填充buf内容 | |
// 将buf后面未读的数据移动到前面去,等待下次填充buf后重新读取 | |
System.arraycopy(buf, pos, buf, 0, endPos - pos); | |
pos = endPos - pos; | |
break; | |
} | |
try { | |
// 读取两个marker中间的hex字符,转换为相关的unicode字符 | |
int codePoint = EmojiConverterHelper.parseInt(buf, pos + 1, nextMarkerPos, RADIX); | |
writer.write(Character.toChars(codePoint)); | |
pos = nextMarkerPos + 1; | |
} catch (NumberFormatException e) { | |
writer.write(c); | |
pos++; | |
} | |
} | |
} while (true); | |
} | |
/** | |
* 过滤emoji字符,上面方法转换前后的都可以过滤 | |
*/ | |
public static void filter(Reader reader, Writer writer) throws IOException { | |
char[] buf = new char[BUF_LEN]; | |
int pos = 0; | |
do { | |
int readLen = reader.read(buf, pos, buf.length - pos); | |
if (pos == 0 && readLen == -1) { // 没有字符了,退出 | |
break; | |
} | |
int endPos = pos + (readLen == -1 ? 0 : readLen); // buf中有效字符结束位置(不包括) | |
pos = 0; | |
while (true) { // 遍历buf中的字符 | |
if (pos >= endPos) { // 已经遍历完了,则退出循环 | |
pos = 0; | |
break; | |
} | |
char c = buf[pos]; | |
if (Character.isSurrogate(c)) { | |
pos++; | |
continue; | |
} | |
if (c != MARKER) { | |
writer.write(c); | |
pos++; | |
continue; | |
} | |
int maxNextMarkerPos = Math.min(endPos, pos + 10); | |
int nextMarkerPos = nextMarkerPos(buf, pos + 1, maxNextMarkerPos); | |
if (nextMarkerPos == -1 || nextMarkerPos - pos > 9) { | |
// 没找到匹配的结束marker | |
if (readLen == -1 || pos == 0 || maxNextMarkerPos < endPos) { | |
writer.write(c); | |
pos++; | |
continue; | |
} | |
// 这种没找到可能是因为读到了buf末尾,需要继续填充buf内容 | |
// 将buf后面未读的数据移动到前面去,等待下次填充buf后重新读取 | |
System.arraycopy(buf, pos, buf, 0, endPos - pos); | |
pos = endPos - pos; | |
break; | |
} | |
try { | |
EmojiConverterHelper.parseInt(buf, pos + 1, nextMarkerPos, RADIX); | |
pos = nextMarkerPos + 1; | |
} catch (NumberFormatException e) { | |
writer.write(c); | |
pos++; | |
} | |
} | |
} while (true); | |
} | |
private static int nextMarkerPos(char[] buf, int fromPos, int endPos) { | |
for (int i = fromPos; i < endPos; i++) { | |
if (buf[i] == MARKER) { | |
return i; | |
} | |
} | |
return -1; | |
} | |
public static void main(String[] args) throws Exception { | |
String str = "你😄\uD83D\uDE03\uD83C\uDDF5\uD83C\uDDFE好"; | |
System.out.println(str); | |
System.out.println(encode(str)); | |
System.out.println(decode(encode(str))); | |
if (!str.equals(decode(encode(str)))) { | |
throw new RuntimeException("not equal!"); | |
} | |
String strErr1 = "A\uD83DB"; | |
System.out.println(encode(strErr1)); | |
String strErr2 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?"; | |
System.out.println(decode(strErr2)); | |
String strErr3 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥\u0007林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?"; | |
System.out.println(decode(strErr3)); | |
String strErr4 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥\u0007666\u0007林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?"; | |
System.out.println(decode(strErr4)); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.io.Writer; | |
/**
 * Helpers for {@code EmojiConverter}. To reduce garbage, parts of
 * {@link Integer} were copied and adapted to work directly on char arrays.
 *
 * @author CJL
 * @since 2018-02-26
 */
class EmojiConverterHelper {
    /**
     * All possible chars for representing a number as a String
     */
    private final static char[] digits = {
        '0', '1', '2', '3', '4', '5',
        '6', '7', '8', '9', 'a', 'b',
        'c', 'd', 'e', 'f', 'g', 'h',
        'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't',
        'u', 'v', 'w', 'x', 'y', 'z'
    };

    /**
     * Writes {@code i}, treated as unsigned, into the start of {@code buf}
     * using the given radix.
     *
     * @param i     value to convert
     * @param buf   destination; must be large enough for all digits
     * @param radix one of 2, 4, 8, 16 or 32
     * @return the number of chars written to {@code buf}
     * @throws NumberFormatException if {@code radix} is not supported
     */
    static int writeIntChars(int i, char[] buf, int radix) {
        switch (radix) {
            case 2:
                return toUnsignedString0(i, buf, 1);
            case 4:
                return toUnsignedString0(i, buf, 2);
            case 8:
                return toUnsignedString0(i, buf, 3);
            case 16:
                return toUnsignedString0(i, buf, 4);
            case 32:
                return toUnsignedString0(i, buf, 5);
            default:
                throw new NumberFormatException("Not support radix " + radix);
        }
    }

    /**
     * Adapted from {@code Integer.toUnsignedString0(..)}: writes the digits of
     * {@code val} into {@code buf[0, chars)} and returns {@code chars}.
     *
     * @param shift log2 of the radix (1 to 5)
     */
    private static int toUnsignedString0(int val, char[] buf, int shift) {
        // assert shift > 0 && shift <=5 : "Illegal shift value";
        int mag = Integer.SIZE - Integer.numberOfLeadingZeros(val); // significant bits
        final int chars = Math.max(((mag + (shift - 1)) / shift), 1); // at least one digit, for 0
        int charPos = chars;
        int mask = (1 << shift) - 1;
        // Fill from the least significant digit backwards.
        do {
            buf[--charPos] = digits[val & mask];
            val >>>= shift;
        } while (val != 0 && charPos > 0);
        return chars;
    }

    /**
     * Adapted from {@code Integer.parseInt(..)}: parses the signed integer
     * written in {@code buf[fromPos, toPos)}. A leading {@code '+'} or
     * {@code '-'} is accepted.
     *
     * @param buf     characters to parse
     * @param fromPos index of the first character (inclusive)
     * @param toPos   index after the last character (exclusive)
     * @param radix   radix between {@link Character#MIN_RADIX} and
     *                {@link Character#MAX_RADIX}
     * @return the parsed value
     * @throws NumberFormatException if {@code buf} is {@code null}, the radix or
     *                               the range is invalid, the range is empty, or
     *                               the text is not a number that fits an {@code int}
     */
    static int parseInt(char[] buf, int fromPos, int toPos, int radix) throws NumberFormatException {
        if (buf == null) {
            throw new NumberFormatException("null");
        }
        if (radix < Character.MIN_RADIX) {
            throw new NumberFormatException("radix " + radix +
                    " less than Character.MIN_RADIX");
        }
        if (radix > Character.MAX_RADIX) {
            throw new NumberFormatException("radix " + radix +
                    " greater than Character.MAX_RADIX");
        }
        if (fromPos < 0 || toPos > buf.length || fromPos > toPos) {
            // Reject the range up front so the failure is a NumberFormatException
            // rather than an index error raised while reading buf.
            throw new NumberFormatException("Invalid range [" + fromPos + ", " + toPos
                    + ") for buffer of length " + buf.length);
        }
        if (fromPos == toPos) {
            throw forInput(buf, fromPos, toPos);
        }

        int result = 0;
        boolean negative = false;
        int pos = fromPos;
        int limit = -Integer.MAX_VALUE;

        char firstChar = buf[fromPos];
        if (firstChar < '0') { // Possible leading "+" or "-"
            if (firstChar == '-') {
                negative = true;
                limit = Integer.MIN_VALUE;
            } else if (firstChar != '+') {
                throw forInput(buf, fromPos, toPos);
            }
            if (toPos - fromPos == 1) { // Cannot have lone "+" or "-"
                throw forInput(buf, fromPos, toPos);
            }
            pos++;
        }
        int multmin = limit / radix;
        while (pos < toPos) {
            // Accumulating negatively avoids surprises near MAX_VALUE
            int digit = Character.digit(buf[pos++], radix);
            if (digit < 0) {
                throw forInput(buf, fromPos, toPos);
            }
            if (result < multmin) { // the multiplication below would overflow
                throw forInput(buf, fromPos, toPos);
            }
            result *= radix;
            if (result < limit + digit) { // the subtraction below would overflow
                throw forInput(buf, fromPos, toPos);
            }
            result -= digit;
        }
        return negative ? result : -result;
    }

    /** Builds the exception reported for unparsable text in {@code buf[fromPos, toPos)}. */
    private static NumberFormatException forInput(char[] buf, int fromPos, int toPos) {
        return new NumberFormatException("For input string:" + new String(buf, fromPos, toPos - fromPos));
    }

    /**
     * Copied from {@code StringWriter}, with the {@code StringBuffer} replaced
     * by a {@code StringBuilder}.
     */
    public static class SimpleStringWriter extends Writer {
        private final StringBuilder buf;

        /** Creates a writer with a default-sized buffer. */
        public SimpleStringWriter() {
            buf = new StringBuilder();
            lock = buf;
        }

        /**
         * Creates a writer whose buffer has the given initial capacity.
         *
         * @throws IllegalArgumentException if {@code initialSize} is negative
         */
        public SimpleStringWriter(int initialSize) {
            if (initialSize < 0) {
                throw new IllegalArgumentException("Negative buffer size");
            }
            buf = new StringBuilder(initialSize);
            lock = buf;
        }

        /** Appends the single character {@code (char) c}. */
        @Override
        public void write(int c) {
            buf.append((char) c);
        }

        /**
         * Appends {@code len} chars of {@code cbuf} starting at {@code off}.
         *
         * @throws IndexOutOfBoundsException if the range is outside {@code cbuf}
         */
        @Override
        public void write(char cbuf[], int off, int len) {
            if ((off < 0) || (off > cbuf.length) || (len < 0) ||
                    ((off + len) > cbuf.length) || ((off + len) < 0)) {
                throw new IndexOutOfBoundsException();
            } else if (len == 0) {
                return;
            }
            buf.append(cbuf, off, len);
        }

        /** Appends the whole string. */
        @Override
        public void write(String str) {
            buf.append(str);
        }

        /** Appends {@code len} chars of {@code str} starting at {@code off}. */
        @Override
        public void write(String str, int off, int len) {
            buf.append(str.substring(off, off + len));
        }

        /** Appends {@code csq}, or the text {@code "null"} when it is {@code null}. */
        @Override
        public SimpleStringWriter append(CharSequence csq) {
            if (csq == null) {
                write("null");
            } else {
                write(csq.toString());
            }
            return this;
        }

        /** Appends {@code csq[start, end)}; a {@code null} sequence is treated as {@code "null"}. */
        @Override
        public SimpleStringWriter append(CharSequence csq, int start, int end) {
            CharSequence cs = (csq == null ? "null" : csq);
            write(cs.subSequence(start, end).toString());
            return this;
        }

        /** Appends a single character. */
        @Override
        public SimpleStringWriter append(char c) {
            write(c);
            return this;
        }

        /** Returns everything written so far. */
        @Override
        public String toString() {
            return buf.toString();
        }

        /** Returns the underlying builder itself (not a copy). */
        public StringBuilder getBuffer() {
            return buf;
        }

        /** Does nothing: all output is already held in the in-memory buffer. */
        @Override
        public void flush() {
        }

        /** Does nothing. */
        @Override
        public void close() throws IOException {
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment