我们native层配置文件用了两套编码,unicode和gbk2312,都是两个字节表示中文。要统一采用gbk编码,顺便熟悉一下gbk。
为方便起见,使用python做测试。python3默认的字符串类str采用unicode,s = '中文' 等价于 s = u'中文'。
dp_gbk_bs将字符串转化成gbk编码,并输出每个中文对应的unsigned short。
wd_2_gbk_val 将单个字符转化成gbk编码的unsigned short值。
gbk_val_2_wd 将gbk编码下的一个short数值还原成中文字符串。
import struct


# function 0
# Print the GBK code value of every character in a string.
def dp_gbk_bs(name):
    """Dump a string as GBK bytes and print each character's code value.

    The whole string is first printed together with its GBK byte string.
    Then every character is encoded on its own and its bytes are read as a
    little-endian unsigned integer, so a two-byte Chinese character yields
    the same value an ``unsigned short`` would hold on a little-endian
    machine. Single-byte (ASCII) characters yield their byte value instead
    of breaking the dump.

    Args:
        name: The text to dump.

    Raises:
        UnicodeEncodeError: If a character cannot be encoded as GBK.
    """
    gbk_bs = str.encode(name, 'gbk')
    print( name, "(gbk2312) = ", gbk_bs)
    # Encode per character: the number of shorts is NOT len(name) once the
    # string mixes one-byte and two-byte characters.
    for ch in name:
        print(ch, " = ", int.from_bytes(ch.encode('gbk'), 'little'))


# function 1
def wd_2_gbk_val(wd):
    """Return the unsigned short value of a single two-byte GBK character.

    Example: '中' -> 53462 (0xd0d6).

    Args:
        wd: A string whose GBK encoding is exactly two bytes long.

    Returns:
        The two GBK bytes interpreted as a little-endian unsigned short.

    Raises:
        struct.error: If the GBK encoding of ``wd`` is not two bytes long.
        UnicodeEncodeError: If ``wd`` cannot be encoded as GBK.
    """
    gbk_bt = wd.encode('gbk')
    # '<H' pins little-endian so the result matches gbk_val_2_wd, which
    # always treats the low byte as the first GBK byte.
    return struct.unpack('<H', gbk_bt)[0]


ss = u'中国技术交易大厦'
dp_gbk_bs(ss)


# function 2
# Turn the integer back into its little-endian bytes, then decode them as GBK.
def gbk_val_2_wd(us_v):
    """Convert a GBK code value back into its character.

    Example: 53462 (0xd0d6) -> '中'.

    Args:
        us_v: Non-negative integer holding the GBK bytes in little-endian
            order (the value produced by ``wd_2_gbk_val``).

    Returns:
        The decoded string.

    Raises:
        ValueError: If ``us_v`` is negative, or if the bytes are not valid
            GBK (``UnicodeDecodeError`` is a ``ValueError`` subclass).
    """
    if us_v < 0:
        raise ValueError("GBK code value must be non-negative: %r" % (us_v,))
    print('hex str = ', hex(us_v))
    # Use as many bytes as the value needs (at least one) so one-byte values
    # such as 0x0a survive; hand-splitting hex text breaks on odd lengths.
    n_bytes = max(1, (us_v.bit_length() + 7) // 8)
    bys = us_v.to_bytes(n_bytes, 'little')
    print('reversed hex str = ', bys.hex())
    print('bytes = ', bys)
    return bys.decode('gbk')


string = '中国技术交易大厦'
val_l = []
for ss in string:
    val = wd_2_gbk_val(ss)
    val_l.append( val )
    print( ss, " = ", val)
print(val_l)
for val in val_l:
    print('------------------')
    print(val, " = ", gbk_val_2_wd(val))

# Output of the script above:
中 = 53462 国 = 64185 技 = 48316 术 = 62922 交 = 48061 易 = 55250 大 = 62388 厦 = 50127 [53462, 64185, 48316, 62922, 48061, 55250, 62388, 50127] ------------------ hex str = 0xd0d6 reversed hex str = d6d0 bytes = b'\xd6\xd0' 53462 = 中 ------------------ hex str = 0xfab9 reversed hex str = b9fa bytes = b'\xb9\xfa' 64185 = 国 ------------------ hex str = 0xbcbc reversed hex str = bcbc bytes = b'\xbc\xbc' 48316 = 技 ------------------ hex str = 0xf5ca reversed hex str = caf5 bytes = b'\xca\xf5' 62922 = 术 ------------------ hex str = 0xbbbd reversed hex str = bdbb bytes = b'\xbd\xbb' 48061 = 交 ------------------ hex str = 0xd7d2 reversed hex str = d2d7 bytes = b'\xd2\xd7' 55250 = 易 ------------------ hex str = 0xf3b4 reversed hex str = b4f3 bytes = b'\xb4\xf3' 62388 = 大 ------------------ hex str = 0xc3cf reversed hex str = cfc3 bytes = b'\xcf\xc3' 50127 = 厦
char buf[2] = {0}; short word = 53462; // ‘中’ // 0xd0d6 memcpy(buf, &word, 2); // [-42, -48], 0xd0是-48的补码, 0xd6是-42的补码 // 0xd6 0xd0 unsigned char* byte_ptr = (unsigned char*)&word; // 小端存储 unsigned char byte0 = byte_ptr[0]; // 214 0xd6,0xd6是214的原码 unsigned char byte1 = byte_ptr[1]; // 208 0xd0,0xd0是208的原码
unsigned short word存储汉字‘中’的gbk2312码值,其内容拷贝到char buf数组中,buf内容为负数,buf[0]为word的低字节,buf[1]为高字节。通过byte_ptr指针取得内容为正数。
内存高地址 | 内存 低地址
d0 | d6
-48 | -42 // signed char,对应的负数补码
208 | 214 // unsigned char 对应的正数原码
buf[0] | buf[1]
取低地址, 取高地址
-42 | -48
Unicode2GBK函数:
/** * 将unicode字符串转化成gbk2312编码的字符串 */ int Unicode2GBK( wchar_t *pUnicode, char** ppDest) { #ifndef CODE_PAGE_GB18030 #define CODE_PAGE_GB18030 54936 #endif // get the size of the dest string const int size = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0/* you can do more for it*/, pUnicode, -1, 0, 0, 0, 0 ); if ( size == 0 ) { return -1; } char* pDestString = new char[size + 2]; ::memset( pDestString, 0, sizeof(pDestString) ); // transform int ret = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0, pUnicode, -1, pDestString, size, 0, 0 ); if( ret == 0 ) { delete pDestString; // 失败 return -1; } else { *ppDest = pDestString; return 0; } return -1; }