按指定的字符数量拆分UTF-8编码的字符串:标点符号、英文字母、中文字符、特殊符号均按一个字符计算。提供Java和C#两个版本,可用于拆分发送超长短信等场景。
实现上采用查表法:根据每个字符的首字节查表,得到该UTF-8字符占用的字节数量。
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
public class Utf8Split {
    /**
     * Lookup table giving the length in bytes of a UTF-8 sequence, indexed by
     * the unsigned value (0-255) of its first byte.
     *
     * UTF-8 encoding rule: a single-byte character has its highest bit clear;
     * in a multi-byte character the number of consecutive 1 bits at the top of
     * the first byte gives the sequence length, and every following byte
     * starts with the bits 10.
     *
     * Layout of the table:
     *   0x00-0xBF -> 1 (this range includes the continuation bytes 0x80-0xBF,
     *                   so a stray continuation byte advances the scan by one)
     *   0xC0-0xDF -> 2
     *   0xE0-0xEF -> 3
     *   0xF0-0xF7 -> 4
     *   0xF8-0xFB -> 5, 0xFC-0xFD -> 6 (legacy long forms)
     *   0xFE-0xFF -> 1
     */
    public static final byte[] UTF8_LOOK_FOR_TABLE = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
    };

    /** Charset used for both encoding the demo text and decoding segments. */
    private static final java.nio.charset.Charset UTF_8 =
            java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Demo entry point: encodes a sample sentence as UTF-8, splits it into
     * segments of 6 characters and prints one segment per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String msg = "7月6日20时07分,杭州市公安局江干区分局接群众求助,称三堡北苑居民来某某于7月5日凌晨失踪。";
            byte[] bData = msg.getBytes(UTF_8);
            List<String> vals = split(bData, 6);
            for (String item : vals) {
                System.out.println(item);
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Splits UTF-8 encoded text into segments holding a fixed number of
     * characters.
     *
     * A "character" here is one UTF-8 sequence as measured by
     * {@link #UTF8_LOOK_FOR_TABLE}: punctuation, Latin letters, CJK characters
     * and symbols each count as one, regardless of how many bytes they occupy.
     * A 4-byte sequence also counts as one character even though it decodes to
     * two Java {@code char}s.
     *
     * @param utf8Data       UTF-8 encoded bytes to split
     * @param splitCharCount number of characters per segment; must be positive
     * @return the segments in order; every segment has exactly
     *         {@code splitCharCount} characters except possibly the last one,
     *         and the list is empty when {@code utf8Data} is empty
     * @throws NullPointerException     if {@code utf8Data} is null
     * @throws IllegalArgumentException if {@code splitCharCount} is not positive
     */
    public static List<String> split(byte[] utf8Data, int splitCharCount) {
        if (utf8Data == null) {
            throw new NullPointerException("utf8Data must not be null");
        }
        if (splitCharCount <= 0) {
            throw new IllegalArgumentException(
                    "splitCharCount must be positive: " + splitCharCount);
        }

        List<String> segments = new ArrayList<>();
        final int total = utf8Data.length;
        int segmentStart = 0;
        int charsInSegment = 0;
        int pos = 0;

        while (pos < total) {
            int charLen = UTF8_LOOK_FOR_TABLE[utf8Data[pos] & 0xFF];
            // Clamp so that a truncated trailing sequence cannot move the
            // cursor past the end of the array.
            pos = Math.min(pos + charLen, total);
            // Count the character before testing the limit, so the first
            // segment gets the same number of characters as the others.
            charsInSegment++;
            if (charsInSegment == splitCharCount) {
                segments.add(new String(utf8Data, segmentStart, pos - segmentStart, UTF_8));
                segmentStart = pos;
                charsInSegment = 0;
            }
        }

        // Remaining characters that did not fill a whole segment.
        if (segmentStart < total) {
            segments.add(new String(utf8Data, segmentStart, total - segmentStart, UTF_8));
        }
        return segments;
    }
}
using System;
using System.Collections.Generic;
using System.Text;
namespace KMCB {
    class Program {
        /// <summary>
        /// Lookup table giving the length in bytes of a UTF-8 sequence, indexed
        /// by the value (0-255) of its first byte.
        /// </summary>
        /// <remarks>
        /// UTF-8 encoding rule: a single-byte character has its highest bit
        /// clear; in a multi-byte character the number of consecutive 1 bits at
        /// the top of the first byte gives the sequence length, and every
        /// following byte starts with the bits 10.
        /// Layout: 0x00-0xBF map to 1 (this includes the continuation bytes
        /// 0x80-0xBF, so a stray continuation byte advances the scan by one),
        /// 0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4, 0xF8-0xFB to 5,
        /// 0xFC-0xFD to 6 and 0xFE-0xFF to 1.
        /// </remarks>
        private static readonly byte[] UTF8_LOOK_FOR_TABLE = {
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
        };

        /// <summary>
        /// Demo entry point: encodes a sample sentence as UTF-8, splits it into
        /// segments of 6 characters and prints one segment per line.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args) {
            try {
                String msg = "7月6日20时07分,杭州市公安局江干区分局接群众求助,称三堡北苑居民来某某于7月5日凌晨失踪。";
                // Encode straight to UTF-8. Going through Encoding.Default
                // first can replace characters the system code page lacks.
                byte[] bUTF8 = Encoding.UTF8.GetBytes(msg);
                IList<string> vals = split(bUTF8, 6);
                foreach (string item in vals) {
                    Console.WriteLine(item);
                }
            } catch (Exception e) {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Splits UTF-8 encoded text into segments holding a fixed number of
        /// characters.
        /// </summary>
        /// <remarks>
        /// A "character" here is one UTF-8 sequence as measured by the length
        /// table: punctuation, Latin letters, CJK characters and symbols each
        /// count as one, regardless of how many bytes they occupy.
        /// </remarks>
        /// <param name="utf8Data">UTF-8 encoded bytes to split.</param>
        /// <param name="splitCharCount">
        /// Number of characters per segment; must be positive.
        /// </param>
        /// <returns>
        /// The segments in order. Every segment has exactly
        /// <paramref name="splitCharCount"/> characters except possibly the
        /// last one; the list is empty when the input is empty.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="utf8Data"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="splitCharCount"/> is not positive.
        /// </exception>
        internal static IList<string> split(byte[] utf8Data, int splitCharCount) {
            if (utf8Data == null) {
                throw new ArgumentNullException("utf8Data");
            }
            if (splitCharCount <= 0) {
                throw new ArgumentOutOfRangeException("splitCharCount", "splitCharCount must be positive.");
            }

            IList<string> segments = new List<string>();
            int total = utf8Data.Length;
            int segmentStart = 0;
            int charsInSegment = 0;
            int pos = 0;

            while (pos < total) {
                int charLen = UTF8_LOOK_FOR_TABLE[utf8Data[pos]];
                // Clamp so that a truncated trailing sequence cannot move the
                // cursor past the end of the array.
                pos = Math.Min(pos + charLen, total);
                // Count the character before testing the limit, so the first
                // segment gets the same number of characters as the others.
                charsInSegment++;
                if (charsInSegment == splitCharCount) {
                    segments.Add(Encoding.UTF8.GetString(utf8Data, segmentStart, pos - segmentStart));
                    segmentStart = pos;
                    charsInSegment = 0;
                }
            }

            // Remaining characters that did not fill a whole segment.
            if (segmentStart < total) {
                segments.Add(Encoding.UTF8.GetString(utf8Data, segmentStart, total - segmentStart));
            }
            return segments;
        }
    }
}