按指定的字符数量拆分UTF-8字符串(Java C#)

根据指定的字符数量拆分 UTF-8 字符串:标点符号、英文、中文、特殊符号均算作一个字符。本文提供 Java 和 C# 两个版本,典型用途是拆分发送超长短信。

核心思想

通过查表法,根据每个 UTF-8 字符首字节的值得到该字符占用的字节数,从而可以按字符(而不是按字节)遍历字节数组并进行切分。

Java 版本

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

public class Utf8Split {
    /**
     * UTF-8 sequence-length lookup table, indexed by the unsigned value (0-255) of the
     * first byte of a sequence.
     *
     * UTF-8 encoding rule: a single-byte character has its highest bit set to 0; for a
     * multi-byte character, the number of consecutive 1 bits starting from the highest
     * bit of the first byte gives the total number of bytes, and every following byte
     * starts with the bits 10.
     *
     * Layout of the table:
     *   0x00-0xBF -> 1 (ASCII, plus continuation bytes so a scan always advances)
     *   0xC0-0xDF -> 2
     *   0xE0-0xEF -> 3
     *   0xF0-0xF7 -> 4
     *   0xF8-0xFB -> 5
     *   0xFC-0xFD -> 6
     *   0xFE-0xFF -> 1
     */
    public static final byte[] UTF8_LOOK_FOR_TABLE = {
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
    };

    /**
     * Demo entry point: encodes a sample sentence as UTF-8, splits it into segments of
     * 6 characters and prints one segment per line.
     */
    public static void main(String[] args) {
        try {
            String msg = "7月6日20时07分,杭州市公安局江干区分局接群众求助,称三堡北苑居民来某某于7月5日凌晨失踪。";
            byte[] bData = msg.getBytes("UTF-8");
            List<String> vals = split(bData, 6);
            for (String item : vals) {
                System.out.println(item);
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Splits UTF-8 encoded text into segments that each contain at most
     * {@code splitCharCount} characters.
     *
     * One "character" is one UTF-8 sequence as measured by {@link #UTF8_LOOK_FOR_TABLE},
     * so punctuation, Latin letters, CJK characters and symbols each count as one.
     * Every segment except possibly the last holds exactly {@code splitCharCount}
     * characters.
     *
     * @param utf8Data          UTF-8 encoded bytes; must not be null
     * @param splitCharCount    maximum number of characters per segment; must be positive
     * @return  the decoded segments in input order; empty when {@code utf8Data} is empty
     * @throws UnsupportedEncodingException if the runtime does not provide the UTF-8 charset
     * @throws IllegalArgumentException if {@code splitCharCount} is zero or negative
     * @throws NullPointerException if {@code utf8Data} is null
     */
    private static List<String> split(byte[] utf8Data, int splitCharCount)
            throws UnsupportedEncodingException {
        if (utf8Data == null) {
            throw new NullPointerException("utf8Data must not be null");
        }
        if (splitCharCount <= 0) {
            throw new IllegalArgumentException("splitCharCount must be greater than zero");
        }

        List<String> vals = new ArrayList<>();
        int bLen = utf8Data.length;
        int i = 0, offset = 0, charCount = 0;

        while (i < bLen) {
            // Jump over the whole sequence that starts at position i.
            i += UTF8_LOOK_FOR_TABLE[0xFF & utf8Data[i]];
            if (i > bLen) {
                // The input ends in the middle of a multi-byte sequence; clamp so the
                // segment never reaches past the end of the array.
                i = bLen;
            }
            // Count the character just consumed BEFORE comparing, so that every
            // segment (including the first) holds exactly splitCharCount characters.
            if (++charCount == splitCharCount) {
                // Decode straight from the source array; no temporary copy needed.
                vals.add(new String(utf8Data, offset, i - offset, "UTF-8"));
                offset = i;
                charCount = 0;
            }
        }

        // Remaining characters that did not fill a complete segment.
        if (offset < bLen) {
            vals.add(new String(utf8Data, offset, bLen - offset, "UTF-8"));
        }
        return vals;
    }
}

C# 版本

using System;
using System.Collections.Generic;
using System.Text;

namespace KMCB {
    class Program {

        /// <summary>
        /// UTF-8 sequence-length lookup table, indexed by the value (0-255) of the first
        /// byte of a sequence.
        /// UTF-8 encoding rule: a single-byte character has its highest bit set to 0; for
        /// a multi-byte character, the number of consecutive 1 bits starting from the
        /// highest bit of the first byte gives the total number of bytes, and every
        /// following byte starts with the bits 10.
        /// Layout: 0x00-0xBF -> 1 (ASCII, plus continuation bytes so a scan always
        /// advances), 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4, 0xF8-0xFB -> 5,
        /// 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
        /// </summary>
        private static readonly byte[] UTF8_LOOK_FOR_TABLE = {
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
        };

        /// <summary>
        /// Demo entry point: encodes a sample sentence as UTF-8, splits it into segments
        /// of 6 characters and prints one segment per line.
        /// </summary>
        static void Main(string[] args) {
            try {
                String msg = "7月6日20时07分,杭州市公安局江干区分局接群众求助,称三堡北苑居民来某某于7月5日凌晨失踪。";
                // Encode directly to UTF-8. Going through Encoding.Default first can lose
                // characters that the system's default code page cannot represent.
                byte[] bUTF8 = Encoding.UTF8.GetBytes(msg);
                IList<string> vals = split(bUTF8, 6);
                foreach (string item in vals) {
                    Console.WriteLine(item);
                }
            } catch (Exception e) {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Splits UTF-8 encoded text into segments that each contain at most
        /// <paramref name="splitCharCount"/> characters. One "character" is one UTF-8
        /// sequence as measured by the lookup table, so punctuation, Latin letters, CJK
        /// characters and symbols each count as one. Every segment except possibly the
        /// last holds exactly <paramref name="splitCharCount"/> characters.
        /// </summary>
        /// <param name="utf8Data">UTF-8 encoded bytes; must not be null.</param>
        /// <param name="splitCharCount">Maximum number of characters per segment; must be positive.</param>
        /// <returns>The decoded segments in input order; empty when the input is empty.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="utf8Data"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="splitCharCount"/> is zero or negative.</exception>
        private static IList<string> split(byte[] utf8Data, int splitCharCount) {
            if (utf8Data == null) {
                throw new ArgumentNullException("utf8Data");
            }
            if (splitCharCount <= 0) {
                throw new ArgumentOutOfRangeException("splitCharCount", "splitCharCount must be greater than zero.");
            }

            IList<string> vals = new List<string>();
            int bLen = utf8Data.Length;
            int i = 0, offset = 0, charCount = 0;

            while (i < bLen) {
                // Jump over the whole sequence that starts at position i.
                i += UTF8_LOOK_FOR_TABLE[utf8Data[i]];
                if (i > bLen) {
                    // The input ends in the middle of a multi-byte sequence; clamp so
                    // the segment never reaches past the end of the array.
                    i = bLen;
                }
                // Count the character just consumed BEFORE comparing, so that every
                // segment (including the first) holds exactly splitCharCount characters.
                if (++charCount == splitCharCount) {
                    // Decode straight from the source array; no temporary copy needed.
                    vals.Add(Encoding.UTF8.GetString(utf8Data, offset, i - offset));
                    offset = i;
                    charCount = 0;
                }
            }

            // Remaining characters that did not fill a complete segment.
            if (offset < bLen) {
                vals.Add(Encoding.UTF8.GetString(utf8Data, offset, bLen - offset));
            }
            return vals;
        }
    }
}

你可能感兴趣的:(C#)