对于生僻字的处理,可以把下面相关编码(utf8 或 gb18030)中的生僻字转换成 gbk伪码 的形式来表示:
/**
* 生僻字工具类
*
* @author xdr630
*/
public final class RareCharacterUtils {

    /*
     * Pseudocode format
     * -----------------
     * A rare character is written as the two-character marker "`H" followed by
     * its Unicode code point in upper-case hexadecimal, left-padded with zeros
     * to at least five digits (for example U+9FBF becomes "`H09FBF").
     * All other characters are copied through unchanged.
     *
     * Known limitation: code points at or above U+100000 are written with six
     * hex digits, but the decoder accepts a rare code point as soon as five
     * digits have been read, so such characters do not round-trip.
     */

    /** Marker that introduces an encoded rare character. */
    private static final String HEX_PREFIX = "`H";

    /** Minimum number of hex digits emitted after the marker. */
    private static final int PADDED_HEX_DIGITS = 5;

    /** Number of hex digits the decoder needs before it tries to decode. */
    private static final int MIN_DECODE_DIGITS = 4;

    private static final Charset GB18030 = Charset.forName("gb18030");

    private RareCharacterUtils() {
    }

    /**
     * Tells whether the text contains a rare character or an encoded one.
     *
     * <p>The result is {@code true} when any code point is classified as rare,
     * or when the text contains the marker {@code `H} immediately followed by
     * at least two hexadecimal digits.
     *
     * @param text text to inspect; {@code null} is treated as containing nothing
     * @return {@code true} if a rare character or a pseudocode marker is found
     */
    public static boolean containsRareCharacter(String text) {
        if (text == null) {
            return false;
        }
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (isRareCharacter(codePoint)) {
                return true;
            }
            if (codePoint == '`' && hasPseudocodeMarkerAt(text, i)) {
                return true;
            }
            // Step over both halves of a surrogate pair at once.
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Checks for {@code `H} plus two hex digits starting at the given backtick.
     *
     * @param text          text being scanned
     * @param backtickIndex index of a backtick in {@code text}
     * @return {@code true} if the marker and two hex digits follow
     */
    private static boolean hasPseudocodeMarkerAt(String text, int backtickIndex) {
        return backtickIndex + 3 < text.length()
                && text.charAt(backtickIndex + 1) == 'H'
                && isHexCode(text.charAt(backtickIndex + 2))
                && isHexCode(text.charAt(backtickIndex + 3));
    }

    /**
     * Classifies a code point as rare.
     *
     * <p>Every valid code point is rare unless it is on the allow-list in
     * {@link #isCommonCharacter(int)}. This covers, among others, the Kangxi
     * radicals (U+2F00-U+2FDF), the CJK additions U+9FA6-U+9FEF, and every
     * supplementary-plane character (CJK extensions B and later, emoji, ...).
     *
     * @param c Unicode code point
     * @return {@code true} if {@code c} is a valid code point that is not common
     */
    private static boolean isRareCharacter(int c) {
        if (c < 0 || c > Character.MAX_CODE_POINT) {
            return false;
        }
        return !isCommonCharacter(c);
    }

    /**
     * Allow-list of code points that are passed through without encoding:
     * ASCII, the degree / plus-minus / middle-dot signs, part of the CJK
     * symbols block, the basic CJK ideographs U+4E00-U+9FA5 and the
     * full-width ASCII forms.
     *
     * @param c Unicode code point
     * @return {@code true} if {@code c} is on the allow-list
     */
    private static boolean isCommonCharacter(int c) {
        return c <= 0x7F
                || c == 0x00B0 || c == 0x00B1 || c == 0x00B7
                || (c >= 0x3000 && c <= 0x3003)
                || (c >= 0x3005 && c <= 0x3017)
                || (c >= 0x4E00 && c <= 0x9FA5)
                || (c >= 0xFF01 && c <= 0xFF5E);
    }

    /**
     * Replaces every rare character with its {@code `H}-prefixed pseudocode.
     *
     * @param text text to encode; may be {@code null}
     * @return encoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String utf8ToGbk(String text) {
        return charsetToPseudocode(StandardCharsets.UTF_8, text);
    }

    /**
     * Replaces every rare character with its {@code `H}-prefixed pseudocode.
     *
     * @param text text to encode; may be {@code null}
     * @return encoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String gb18030ToGbk(String text) {
        return charsetToPseudocode(GB18030, text);
    }

    /**
     * Restores the rare characters in a pseudocode string.
     *
     * @param input pseudocode text; may be {@code null}
     * @return decoded text, or {@code null} if {@code input} is {@code null}
     */
    public static String gbkToUtf8(String input) {
        return gbkPseudocodeToUtf8OrGb18030(input);
    }

    /**
     * Restores the rare characters in a pseudocode string.
     *
     * @param input pseudocode text; may be {@code null}
     * @return decoded text, or {@code null} if {@code input} is {@code null}
     */
    public static String gbkToGb18030(String input) {
        return gbkPseudocodeToUtf8OrGb18030(input);
    }

    /**
     * Encodes rare characters as pseudocode in a single pass over the input.
     *
     * <p>The charset is only validated; the conversion itself works on the
     * code points of the string.
     *
     * @param charset     charset the caller is working with; must support encoding
     * @param inputString text to encode; may be {@code null}
     * @return encoded text, or {@code null} if {@code inputString} is {@code null}
     * @throws IllegalArgumentException if {@code charset} cannot encode
     */
    private static String charsetToPseudocode(Charset charset, String inputString) {
        if (!charset.canEncode()) {
            throw new IllegalArgumentException("not support charset: " + charset.name());
        }
        if (inputString == null) {
            return null;
        }
        final int length = inputString.length();
        final StringBuilder sb = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            final int codePoint = inputString.codePointAt(i);
            final int next = i + Character.charCount(codePoint);
            if (isRareCharacter(codePoint)) {
                appendPseudocode(sb, codePoint);
            } else {
                sb.append(inputString, i, next);
            }
            i = next;
        }
        return sb.toString();
    }

    /**
     * Appends the marker and the zero-padded upper-case hex form of a code point.
     *
     * @param sb        destination
     * @param codePoint code point to write
     */
    private static void appendPseudocode(StringBuilder sb, int codePoint) {
        // Locale.ROOT keeps the hex digits independent of the default locale.
        final String hex = Integer.toHexString(codePoint).toUpperCase(java.util.Locale.ROOT);
        sb.append(HEX_PREFIX);
        for (int pad = hex.length(); pad < PADDED_HEX_DIGITS; pad++) {
            sb.append('0');
        }
        sb.append(hex);
    }

    /**
     * Decodes {@code `H}-prefixed hex sequences back into characters.
     *
     * <p>After the marker, hex digits are read one at a time. Once four digits
     * are available and a fifth hex digit follows, the fifth is read first so
     * that five-digit codes win over their four-digit prefix. The sequence is
     * decoded as soon as its value is a rare code point; if that never happens
     * the marker and the digits read are copied through literally.
     *
     * @param gbkPseudocode pseudocode text; may be {@code null}
     * @return decoded text, or {@code null} if the input is {@code null}
     */
    private static String gbkPseudocodeToUtf8OrGb18030(String gbkPseudocode) {
        if (gbkPseudocode == null) {
            return null;
        }
        final int length = gbkPseudocode.length();
        final StringBuilder sb = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            final char ch = gbkPseudocode.charAt(index);
            final boolean marker = ch == '`'
                    && index + 1 < length
                    && gbkPseudocode.charAt(index + 1) == 'H';
            if (!marker) {
                // Copy exactly one character. A backtick that is not part of a
                // marker must not swallow its successor, otherwise a second
                // backtick that does start a marker would be missed.
                sb.append(ch);
                index++;
                continue;
            }

            final int digitsStart = index + 2;
            int digitsEnd = digitsStart;
            int value = 0;
            boolean decoded = false;
            while (digitsEnd < length && isHexCode(gbkPseudocode.charAt(digitsEnd))) {
                value = (value << 4) | Character.digit(gbkPseudocode.charAt(digitsEnd), 16);
                digitsEnd++;
                final int digits = digitsEnd - digitsStart;
                if (digits < MIN_DECODE_DIGITS) {
                    continue;
                }
                final boolean moreHex = digitsEnd < length
                        && isHexCode(gbkPseudocode.charAt(digitsEnd));
                if (digits == MIN_DECODE_DIGITS && moreHex) {
                    // Resolve e.g. 2CC5 vs 2CC56 in favour of the five-digit code.
                    continue;
                }
                if (isRareCharacter(value)) {
                    sb.appendCodePoint(value);
                    decoded = true;
                    break;
                }
            }
            if (!decoded) {
                // Not a valid encoded character: keep the text as it was.
                sb.append(HEX_PREFIX).append(gbkPseudocode, digitsStart, digitsEnd);
            }
            index = digitsEnd;
        }
        return sb.toString();
    }

    /**
     * Tells whether a character is an ASCII hexadecimal digit.
     *
     * @param ch character to test
     * @return {@code true} for 0-9, A-F and a-f
     */
    private static boolean isHexCode(char ch) {
        return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
    }
}
public static boolean containsRareCharacter(String text)
// 生僻字
Assertions.assertTrue(RareChineseUtils.containsRareCharacter("䶮")); true
Assertions.assertTrue(RareChineseUtils.containsRareCharacter("𬱖")); true
Assertions.assertTrue(RareChineseUtils.containsRareCharacter("崔龿")); true
// 非生僻字
Assertions.assertFalse(RareChineseUtils.containsRareCharacter("!@#$%^&*()_+-=[]{}\\|;':\",./<>?")); false
Assertions.assertFalse(RareChineseUtils.containsRareCharacter("!@#$%^&*()_+-=[]\\{}|;':\",./<>?")); false
Assertions.assertFalse(RareChineseUtils.containsRareCharacter("你好")); false
public static String utf8ToGbk(String text)
hexStringToBytes:将一个十六进制字符串转换为字节数组的方法,将每两个十六进制字符解析成一个字节,并将其存储在字节数组中返回。
String c1 = utf8ToGbk("张𬱖。,,.");
String c2 = utf8ToGbk("你好");
Assertions.assertEquals("张`H2CC56。,,.", c1);
Assertions.assertEquals("你好", c2);
String str = new String(hexStringToBytes("E5B494E9BEBF"), StandardCharsets.UTF_8);
System.out.println(str.getBytes());
Assertions.assertEquals("崔龿", str);
Assertions.assertEquals("崔`H09FBF", utf8ToGbk(str));
public static String gb18030ToGbk(String text)
String str = new String(hexStringToBytes("B4DE82359138"), GB18030);
Assertions.assertEquals("崔龿", str);
Assertions.assertEquals("崔`H09FBF", gb18030ToGbk(str));
public static String gbkToUtf8(String input)
bytesToHex方法是将字节数组转换为十六进制字符串
String s1 = gbkToUtf8("张`H2CC56。,,.");
String s2 = gbkToUtf8("你好");
Assertions.assertEquals("张𬱖。,,.", s1);
Assertions.assertEquals("你好", s2);
String s = gbkToUtf8("崔`H9FBF");
Assertions.assertEquals("E5B494E9BEBF", bytesToHex(s.getBytes(StandardCharsets.UTF_8)));
public static String gbkToGb18030(String input)
String s1 = gbkToGb18030("张`H2CC56。,,.");
String s2 = gbkToGb18030("你好");
Assertions.assertEquals("张𬱖。,,.", s1);
Assertions.assertEquals("你好", s2);
String s = gbkToGb18030("崔`H9FBF");
Assertions.assertEquals("B4DE82359138", bytesToHex(s.getBytes(GB18030)));