续上一篇:源码:处理文件格式和字符集的相关代码(3-1)-CSDN博客
目录
五、主入口:检测和转换
5.1 总体逻辑
5.2 检测BOM
5.3 单字节检测
5.4 utf-16和utf-32检测
5.5 转换格式
六、主入口:预览(正确显示文本文件)
这段代码有点长,不过属于流水账式的写法:分成了几个相对独立的部分,各部分之间有顺序依赖关系。
/// <summary>
/// Main entry point: detects whether <paramref name="file"/> is a text file and which
/// encoding it uses and, unless <paramref name="checkonly"/> is set, rewrites it in place.
/// This listing is the overview; the elided stages are marked with "......" comments.
/// </summary>
/// <param name="file">Path of the file to inspect (and possibly overwrite).</param>
/// <param name="checkonly">When true, only the detection stages run and the file is left untouched.</param>
/// <returns>The detection result; <c>status</c> is "OK" after a successful rewrite.</returns>
public static FileResult ProcessFile(string file, bool checkonly)
{
    FileResult fileResult = new FileResult(); // note: isText defaults to true

    // File.ReadAllBytes keeps reading until the whole file is in memory and releases
    // the handle even when an exception is thrown. A single Stream.Read call is allowed
    // to return fewer bytes than requested, which would leave the tail of the buffer zeroed.
    byte[] data = File.ReadAllBytes(file);

    int BOMsize = 0;
    // BOM detection
    if (data.Length >= 3 && data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF')
    {
        fileResult.isUTF8 = true;
        fileResult.withBOM = true;
        fileResult.BOM = "EF BB BF";
        BOMsize = 3;
    }
    // else if ...... (elided: the remaining branches collect the other BOM variants)

    // Single-byte detection; a multi-byte file is mistaken for non-text at this stage
    if (!fileResult.withBOM || fileResult.isUTF8)
    {
        // ...... (elided: checks files without a BOM and files with a UTF-8 BOM)
    }

    // Multi-byte detection
    if (!fileResult.isASCII && !fileResult.isLocal && !fileResult.isUTF8)
    {
        // ...... (elided: checks UTF-16 and UTF-32 and then the byte order, four cases in total)
    }

    // In check-only mode the method returns here.
    if (checkonly)
    {
        if (fileResult.isText) fileResult.status = Program.isLanguageZH ? "检测完成" : "Checked";
        return fileResult; // detection only
    }

    if (!fileResult.isText) return fileResult; // not a text file

    // Clear the read-only attribute so the file can be overwritten below.
    File.SetAttributes(file, File.GetAttributes(file) & ~FileAttributes.ReadOnly);

    // Output
    {
        // ...... (elided: writes the result according to the output settings, overwriting the input file)
    }

    fileResult.status = "OK";
    ++file_procceed_count;
    return fileResult;
}
// BOM detection.
// The first four bytes are loaded once; -1 stands for "byte not present", so a short
// file can never match a longer signature. The four-byte UTF-32 signatures are tested
// before the two-byte UTF-16 ones because FF FE 00 00 starts with the UTF-16 LE mark FF FE.
int bom0 = data.Length > 0 ? data[0] : -1;
int bom1 = data.Length > 1 ? data[1] : -1;
int bom2 = data.Length > 2 ? data[2] : -1;
int bom3 = data.Length > 3 ? data[3] : -1;

if (bom0 == 0xEF && bom1 == 0xBB && bom2 == 0xBF)
{
    // UTF-8
    BOMsize = 3;
    fileResult.BOM = "EF BB BF";
    fileResult.withBOM = true;
    fileResult.isUTF8 = true;
}
else if (bom0 == 0xFF && bom1 == 0xFE && bom2 == 0x00 && bom3 == 0x00)
{
    // UTF-32, little endian
    BOMsize = 4;
    fileResult.BOM = "FF FE 00 00";
    fileResult.withBOM = true;
    fileResult.isUTF32 = true;
    fileResult.isBigEndian = false;
}
else if (bom0 == 0x00 && bom1 == 0x00 && bom2 == 0xFE && bom3 == 0xFF)
{
    // UTF-32, big endian
    BOMsize = 4;
    fileResult.BOM = "00 00 FE FF";
    fileResult.withBOM = true;
    fileResult.isUTF32 = true;
    fileResult.isBigEndian = true;
}
else if (bom0 == 0xFF && bom1 == 0xFE)
{
    // UTF-16, little endian
    BOMsize = 2;
    fileResult.BOM = "FF FE";
    fileResult.withBOM = true;
    fileResult.isUTF16 = true;
    fileResult.isBigEndian = false;
}
else if (bom0 == 0xFE && bom1 == 0xFF)
{
    // UTF-16, big endian
    BOMsize = 2;
    fileResult.BOM = "FE FF";
    fileResult.withBOM = true;
    fileResult.isUTF16 = true;
    fileResult.isBigEndian = true;
}
// No signature matched: fileResult and BOMsize are left as they were.
关于BOM的全部知识都在代码里了。
文本文件不能包含值为0的字节;纯ASCII文件没有大于127的字节;UTF-8有单独的检测,必须符合UTF-8编码规则;其余则认为是本地编码。
// Single-byte detection: scans the data as byte-oriented text. It runs for files
// without a BOM and for files with a UTF-8 BOM. A multi-byte file (UTF-16/UTF-32
// without BOM) contains NUL bytes and is therefore flagged as non-text here.
if (!fileResult.withBOM || fileResult.isUTF8)
{
    bool hasHighByte = false; // true once any byte above 127 is seen, i.e. not pure ASCII
    for (int i = BOMsize; i < data.Length; ++i)
    {
        if (data[i] == 0)
        {
            // A NUL byte never occurs in single-byte text (this also catches
            // UTF-16/UTF-32 files that have no BOM).
            fileResult.isText = false;
            break;
        }
        else if (data[i] == '\r' && i + 1 < data.Length && data[i + 1] == '\n')
        {
            ++fileResult.count_CRLF;
            ++i; // consume the LF that belongs to this CR
        }
        else if (data[i] == '\r')
        {
            ++fileResult.count_CR;
        }
        else if (data[i] == '\n')
        {
            ++fileResult.count_LF;
        }
        else if (data[i] > 127)
        {
            hasHighByte = true;
        }
    }

    if (fileResult.isText)
    {
        if (!fileResult.isUTF8)
        {
            // No BOM: decide between ASCII, UTF-8 and the local code page.
            if (!hasHighByte)
            {
                fileResult.isASCII = true; // pure ASCII (which is of course valid UTF-8 as well)
            }
            else if (checkUTF8(data, BOMsize))
            {
                fileResult.isUTF8 = true;
            }
            else
            {
                // Anything with high bytes that is not well-formed UTF-8 is treated as the
                // local encoding. UTF-8 is never assumed without validation: double-byte
                // code pages such as GBK allow trail bytes below 0x80, so "a high byte
                // followed by a low byte" does not prove that the text is UTF-8.
                fileResult.isLocal = true;
            }
        }
        else if (fileResult.withBOM && !checkUTF8(data, BOMsize))
        {
            // A UTF-8 BOM followed by bytes that break the UTF-8 rules: not a text file.
            fileResult.isUTF8 = false;
            fileResult.isText = false;
        }
    }
}
四种格式逐一检测;但是有BOM的文件只检测BOM指示的那一种,因为它不可能是其它格式(如果不符合该格式的规则,则必然是二进制文件)。
// Multi-byte detection: runs only when the file was not classified as ASCII, local or UTF-8.
// The candidates are tried in the order UTF-32 BE, UTF-32 LE, UTF-16 BE, UTF-16 LE.
// With a BOM only the variant named by the BOM is tried; without one all four are tried
// and a later match replaces an earlier one.
if (!fileResult.isASCII && !fileResult.isLocal && !fileResult.isUTF8)
{
    int tmpCR = 0;
    int tmpCRLF = 0;
    int tmpLF = 0;
    bool bCheckResult = false;

    // Line-break total of the candidate accepted so far. It starts at 0 instead of
    // reading fileResult.count_*, because those fields may still hold counts from the
    // aborted single-byte scan, which would wrongly reject a valid little-endian file.
    int bestBreaks = 0;

    int[] candidateWidths = { 4, 4, 2, 2 };
    bool[] candidateBigEndian = { true, false, true, false };

    for (int k = 0; k < candidateWidths.Length; ++k)
    {
        int width = candidateWidths[k];
        bool bigEndian = candidateBigEndian[k];

        if (fileResult.withBOM)
        {
            // The BOM already fixed width and byte order; skip every other variant.
            bool bomNamesThisWidth = (width == 4) ? fileResult.isUTF32 : fileResult.isUTF16;
            if (!bomNamesThisWidth || fileResult.isBigEndian != bigEndian)
            {
                continue;
            }
        }

        if (!checkUTF(data, width, bigEndian, ref tmpCR, ref tmpCRLF, ref tmpLF))
        {
            continue;
        }

        // A big-endian match is accepted unconditionally; a little-endian match must
        // find at least as many line breaks as the candidate accepted before it.
        int breaks = tmpCR + tmpCRLF + tmpLF;
        if (!bigEndian && breaks < bestBreaks)
        {
            continue;
        }

        bCheckResult = true;
        bestBreaks = breaks;
        if (width == 4)
        {
            fileResult.isUTF32 = true;
        }
        else
        {
            fileResult.isUTF32 = false; // an earlier UTF-32 candidate may have set this
            fileResult.isUTF16 = true;
        }
        fileResult.isBigEndian = bigEndian;
        fileResult.count_CR = tmpCR;
        fileResult.count_CRLF = tmpCRLF;
        fileResult.count_LF = tmpLF;
    }

    // If no candidate matched, the file is not text.
    fileResult.isText = bCheckResult;
}
根据设置的输出参数改变行结束符、添加或删除BOM。
// Output: rewrites the file in place, converting line breaks and adding or removing
// the BOM according to the output settings.
//   output_bom    : 1 = add a BOM when the file has none, 2 = strip an existing BOM.
//   output_format : 0 = keep line breaks, 1 = CR LF, 2 = LF, 3 = CR.
{
    // Width of one code unit and the offset of its low-order byte inside the unit.
    int charWidth = 1;
    int pos = 0;
    if (fileResult.isUTF16)
    {
        charWidth = 2;
        pos = (fileResult.isBigEndian ? charWidth - 1 : 0);
    }
    if (fileResult.isUTF32)
    {
        charWidth = 4;
        pos = (fileResult.isBigEndian ? charWidth - 1 : 0);
    }

    // Build the replacement line break once, already encoded for the target encoding.
    // null means "leave line breaks untouched".
    byte[] newline = null;
    if (0 != output_format)
    {
        byte[] breakChars;
        if (output_format == 1)
        {
            breakChars = new byte[] { 0x0D, 0x0A };
        }
        else if (output_format == 2)
        {
            breakChars = new byte[] { 0x0A };
        }
        else if (output_format == 3)
        {
            breakChars = new byte[] { 0x0D };
        }
        else
        {
            // NOTE(review): any other non-zero value writes nothing for a line break,
            // i.e. line breaks are dropped. Kept as-is; confirm that this is intended.
            breakChars = new byte[0];
        }

        int nlWidth = fileResult.isUTF16 ? 2 : (fileResult.isUTF32 ? 4 : 1);
        int nlPos = fileResult.isBigEndian ? nlWidth - 1 : 0;
        newline = new byte[breakChars.Length * nlWidth]; // zero-filled: the padding bytes are 0x00
        for (int k = 0; k < breakChars.Length; ++k)
        {
            newline[k * nlWidth + nlPos] = breakChars[k];
        }
    }

    // The using statement closes the stream even when a write throws.
    using (FileStream outfile = new FileStream(file, FileMode.Truncate, FileAccess.Write))
    {
        int skipBOM = 0;
        if (output_bom == 1)
        {
            if (!fileResult.withBOM)
            {
                if (fileResult.isUTF8)
                {
                    outfile.Write(new byte[] { 0xEF, 0xBB, 0xBF }, 0, 3);
                }
                if (fileResult.isUTF16)
                {
                    byte[] bom16 = fileResult.isBigEndian ? new byte[] { 0xFE, 0xFF } : new byte[] { 0xFF, 0xFE };
                    outfile.Write(bom16, 0, bom16.Length);
                }
                if (fileResult.isUTF32)
                {
                    byte[] bom32 = fileResult.isBigEndian
                        ? new byte[] { 0x00, 0x00, 0xFE, 0xFF }
                        : new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
                    outfile.Write(bom32, 0, bom32.Length);
                }
            }
        }
        if (output_bom == 2)
        {
            if (fileResult.withBOM)
            {
                if (fileResult.isUTF8) { skipBOM = 3; }
                if (fileResult.isUTF16) { skipBOM = 2; }
                if (fileResult.isUTF32) { skipBOM = 4; }
            }
        }

        for (int i = skipBOM; i < data.Length; i += charWidth)
        {
            int remaining = data.Length - i;
            if (remaining < charWidth)
            {
                // Trailing partial code unit: copy it verbatim instead of indexing past the array.
                outfile.Write(data, i, remaining);
                break;
            }

            bool isCR = IsTheChar(data, charWidth, pos, i, '\r');
            // CR LF counts as one line break; the LF is only examined when a whole unit follows.
            bool isCRLF = isCR
                && i + 2 * charWidth <= data.Length
                && IsTheChar(data, charWidth, pos, i + charWidth, '\n');
            bool isNewLine = isCR || IsTheChar(data, charWidth, pos, i, '\n');

            if (isNewLine && newline != null)
            {
                outfile.Write(newline, 0, newline.Length);
                if (isCRLF)
                {
                    i += charWidth; // skip the LF of the CR LF pair that was just replaced
                }
            }
            else
            {
                outfile.Write(data, i, charWidth);
            }
        }
    }
}
(这里是文档结束)