源码:处理文件格式和字符集的相关代码(3-2)

续上一篇:源码:处理文件格式和字符集的相关代码(3-1)-CSDN博客

目录

五、主入口:检测和转换

5.1 总体逻辑

5.2 检测BOM

5.3 单字节检测

5.4 utf-16和utf-32检测

5.5 转换格式

六、主入口:预览(正确显示文本文件)

 


五、主入口:检测和转换

5.1 总体逻辑

        这段代码有点长,但属于流水账式的写法:它分成了几个相对独立的部分,各部分之间存在先后顺序上的依赖关系。

		/// <summary>
		/// Detects the BOM, encoding and line-ending style of a file and, unless
		/// <paramref name="checkonly"/> is set, rewrites the file in place according to the output settings.
		/// This listing is an outline: the elided sections are shown in full in sections 5.2 to 5.5.
		/// </summary>
		/// <param name="file">Path of the file to examine (and, when converting, to overwrite).</param>
		/// <param name="checkonly">When true only the detection is performed and the file is left untouched.</param>
		/// <returns>The detection result; <c>isText</c> is false when the content is not recognized as text.</returns>
		public static FileResult ProcessFile(string file, bool checkonly)
		{
			FileResult fileResult = new FileResult();//note: isText defaults to true

			//Read the whole file in one call. File.ReadAllBytes keeps reading until every byte has arrived and
			//always releases the handle; a single Stream.Read may legally return fewer bytes than requested,
			//and the previous manual Close() was skipped whenever an exception was thrown.
			byte[] data = File.ReadAllBytes(file);
			//MessageBox.Show(data.Length.ToString(),file);

			int BOMsize = 0;

			//Detect the BOM
			if (data.Length >= 3 && data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF')
			{
				fileResult.isUTF8 = true;
				fileResult.withBOM = true;
				fileResult.BOM = "EF BB BF";
				BOMsize = 3;
			}
			//else if ... (elided: the remaining branches that obtain the BOM information, see section 5.2)

			//Single-byte detection; multi-byte files will be mistaken for non-text here
			if (!fileResult.withBOM || fileResult.isUTF8)
			{
				//... (elided: examines files without a BOM and files with a UTF-8 BOM, see section 5.3)
			}
			//Multi-byte detection
			if (!fileResult.isASCII && !fileResult.isLocal && !fileResult.isUTF8)
			{
				//... (elided: checks UTF-16 and UTF-32 and then the byte order, four cases in total, see section 5.4)
			}

			//When only checking, the method returns here
			if (checkonly)
			{
				if (fileResult.isText) fileResult.status = Program.isLanguageZH ? "检测完成" : "Checked";
				return fileResult;//check only
			}

			if (!fileResult.isText) return fileResult;//not a text file

			//Clear the read-only attribute so that the file can be overwritten
			File.SetAttributes(file, File.GetAttributes(file) & ~FileAttributes.ReadOnly);

			//Output
			{
				//... (elided: writes the result according to the output parameters, overwriting the input file, see section 5.5)
			}

			fileResult.status = "OK";
			++file_procceed_count;//counter declared elsewhere in the class
			return fileResult;
		}

5.2 检测BOM

			//Detect the BOM. The signatures are grouped by their first byte (EF, FF, 00, FE).
			//Inside the FF FE group the 4-byte UTF-32 signature is tested first, because
			//FF FE 00 00 (UTF-32 little-endian) also begins with FF FE (UTF-16 little-endian).
			if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
			{
				//UTF-8
				BOMsize = 3;
				fileResult.BOM = "EF BB BF";
				fileResult.withBOM = true;
				fileResult.isUTF8 = true;
			}
			else if (data.Length >= 2 && data[0] == 0xFF && data[1] == 0xFE)
			{
				//Little-endian: UTF-32 when two zero bytes follow, otherwise UTF-16
				fileResult.withBOM = true;
				fileResult.isBigEndian = false;
				if (data.Length >= 4 && data[2] == 0x00 && data[3] == 0x00)
				{
					BOMsize = 4;
					fileResult.BOM = "FF FE 00 00";
					fileResult.isUTF32 = true;
				}
				else
				{
					BOMsize = 2;
					fileResult.BOM = "FF FE";
					fileResult.isUTF16 = true;
				}
			}
			else if (data.Length >= 4 && data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF)
			{
				//UTF-32 big-endian
				BOMsize = 4;
				fileResult.BOM = "00 00 FE FF";
				fileResult.withBOM = true;
				fileResult.isBigEndian = true;
				fileResult.isUTF32 = true;
			}
			else if (data.Length >= 2 && data[0] == 0xFE && data[1] == 0xFF)
			{
				//UTF-16 big-endian
				BOMsize = 2;
				fileResult.BOM = "FE FF";
				fileResult.withBOM = true;
				fileResult.isBigEndian = true;
				fileResult.isUTF16 = true;
			}
			//No signature matched: fileResult and BOMsize are left unchanged

        关于BOM的全部知识都在代码里了。

5.3 单字节检测

        文本文件不能包含值为0的字节;纯ASCII文件没有大于127的字节;UTF-8有单独的检测,内容必须符合UTF-8编码规则;其余的则认为是本地编码。

			//Single-byte detection. Runs for files without a BOM and for files with a UTF-8 BOM.
			//Multi-byte files will be mistaken for non-text here.
			if (!fileResult.withBOM || fileResult.isUTF8)
			{
				bool NotASCII = false;
				for (int i = BOMsize; i < data.Length; ++i)
				{
					if (data[i] == 0)
					{
						//MessageBox.Show("非文本文件", file);//this also covers UTF-16 and UTF-32 files without a BOM
						fileResult.isText = false;
						break;
					}
					else if (data[i] == '\r' && i + 1 < data.Length && data[i + 1] == '\n')
					{
						++fileResult.count_CRLF;
						++i;//the LF belongs to this CRLF pair
					}
					else if (data[i] == '\r') ++fileResult.count_CR;
					else if (data[i] == '\n') ++fileResult.count_LF;
					else if (data[i] > 127) NotASCII = true;//not pure ASCII
				}
				if (fileResult.isText)
				{
					if (fileResult.withBOM)
					{
						//A UTF-8 BOM was found (the outer condition guarantees isUTF8 here):
						//the content must really be UTF-8, otherwise the file is not treated as text.
						if (!checkUTF8(data, BOMsize))
						{
							fileResult.isUTF8 = false;
							fileResult.isText = false;
						}
					}
					else if (!NotASCII)
					{
						fileResult.isASCII = true;//pure ASCII without a BOM (which is of course also valid UTF-8)
					}
					else if (checkUTF8(data, BOMsize))
					{
						fileResult.isUTF8 = true;
					}
					else
					{
						//Non-ASCII bytes that do not follow the UTF-8 rules: treat as the local encoding.
						//Previously a high byte not followed by another high byte made the file UTF-8
						//without any validation, which misclassified local encodings whose trail bytes
						//can be below 0x80 as well as single-byte code pages.
						fileResult.isLocal = true;
					}
				}
			}

5.4 utf-16和utf-32检测

        四种格式(UTF-16和UTF-32各分大端、小端)逐一检测;但有BOM的文件只检测BOM指示的那一种,因为不可能是其它格式(如果不符合该格式的规则,则必然是二进制文件)。

			//Multi-byte detection: runs when the file was not classified as ASCII, local or UTF-8.
			//With a BOM only the encoding announced by the BOM is probed; without a BOM all four
			//candidates are probed in turn.
			if (!fileResult.isASCII && !fileResult.isLocal && !fileResult.isUTF8)
			{
				//The single-byte scan may already have counted bytes 0x0D/0x0A before it stopped at the
				//first zero byte. Those counts are meaningless for a multi-byte encoding and, if kept,
				//can make the ">=" comparison below reject a valid little-endian file.
				fileResult.count_CR = 0;
				fileResult.count_CRLF = 0;
				fileResult.count_LF = 0;

				//Candidates in probing order: UTF-32 BE, UTF-32 LE, UTF-16 BE, UTF-16 LE
				int[] candidateWidth = { 4, 4, 2, 2 };
				bool[] candidateBigEndian = { true, false, true, false };
				bool bCheckResult = false;

				for (int k = 0; k < candidateWidth.Length; ++k)
				{
					bool isWide = candidateWidth[k] == 4;
					bool bigEndian = candidateBigEndian[k];

					if (fileResult.withBOM)
					{
						//Probe only the encoding and byte order that the BOM announced
						bool announced = (isWide ? fileResult.isUTF32 : fileResult.isUTF16) && fileResult.isBigEndian == bigEndian;
						if (!announced) continue;
					}

					//Fresh counters for every probe so that one probe cannot leak counts into the next
					//(harmless if checkUTF, defined elsewhere, already resets them itself)
					int tmpCR = 0;
					int tmpCRLF = 0;
					int tmpLF = 0;
					if (!checkUTF(data, candidateWidth[k], bigEndian, ref tmpCR, ref tmpCRLF, ref tmpLF)) continue;

					//A little-endian candidate replaces the current result only when it found at least as
					//many line breaks; a big-endian candidate is accepted unconditionally (as before).
					if (!bigEndian && tmpCR + tmpCRLF + tmpLF < fileResult.count_CR + fileResult.count_CRLF + fileResult.count_LF) continue;

					bCheckResult = true;
					fileResult.isUTF32 = isWide;//a UTF-16 match clears isUTF32 possibly set by an earlier probe
					fileResult.isUTF16 = !isWide;
					fileResult.isBigEndian = bigEndian;
					fileResult.count_CR = tmpCR;
					fileResult.count_CRLF = tmpCRLF;
					fileResult.count_LF = tmpLF;
				}

				//No candidate matched: the file is not text
				fileResult.isText = bCheckResult;
			}

5.5 转换格式

        根据设置的输出参数改变行结束符、添加或删除BOM。

			//Output: apply the requested BOM and line-ending settings and overwrite the input file.
			//output_bom: 1 = add a BOM when missing, 2 = remove an existing BOM, other values = leave as is.
			//output_format: 0 = keep line endings, 1 = CRLF, 2 = LF, 3 = CR.
			{
				//Width of one character in bytes and the offset of its low-order byte inside the character
				int charWidth = 1;
				int pos = 0;
				if (fileResult.isUTF16)
				{
					charWidth = 2;
					pos = (fileResult.isBigEndian ? charWidth - 1 : 0);
				}
				if (fileResult.isUTF32)
				{
					charWidth = 4;
					pos = (fileResult.isBigEndian ? charWidth - 1 : 0);
				}

				//Encode the requested line terminator once, in the file's character width and byte order.
				//null means "keep the original line endings".
				byte[] newlineBytes = null;
				if (0 != output_format)
				{
					byte[] terminator;
					if (output_format == 1) terminator = new byte[] { 0x0D, 0x0A };
					else if (output_format == 2) terminator = new byte[] { 0x0A };
					else if (output_format == 3) terminator = new byte[] { 0x0D };
					else terminator = new byte[0];//unknown value: as before, nothing is written for a line break

					newlineBytes = new byte[terminator.Length * charWidth];//zero-filled; only the low-order bytes are set
					for (int k = 0; k < terminator.Length; ++k)
					{
						newlineBytes[k * charWidth + pos] = terminator[k];
					}
				}

				//Build the complete result in memory first. The file is truncated only after the conversion
				//has succeeded, so an exception during conversion no longer destroys its content.
				byte[] converted;
				using (MemoryStream buffer = new MemoryStream(data.Length + 4))
				{
					int skipBOM = 0;
					if (output_bom == 1)
					{
						if (!fileResult.withBOM)
						{
							if (fileResult.isUTF8) { buffer.WriteByte(0xEF); buffer.WriteByte(0xBB); buffer.WriteByte(0xBF); }
							if (fileResult.isUTF16)
							{
								if (fileResult.isBigEndian) { buffer.WriteByte(0xFE); buffer.WriteByte(0xFF); }
								else { buffer.WriteByte(0xFF); buffer.WriteByte(0xFE); }
							}
							if (fileResult.isUTF32)
							{
								if (fileResult.isBigEndian) { buffer.WriteByte(0x00); buffer.WriteByte(0x00); buffer.WriteByte(0xFE); buffer.WriteByte(0xFF); }
								else { buffer.WriteByte(0xFF); buffer.WriteByte(0xFE); buffer.WriteByte(0x00); buffer.WriteByte(0x00); }
							}
						}
					}
					if (output_bom == 2)
					{
						if (fileResult.withBOM)
						{
							if (fileResult.isUTF8) skipBOM = 3;
							if (fileResult.isUTF16) skipBOM = 2;
							if (fileResult.isUTF32) skipBOM = 4;
						}
					}

					for (int i = skipBOM; i < data.Length; i += charWidth)
					{
						if (i + charWidth > data.Length)
						{
							//Trailing bytes that do not form a whole character: copy them unchanged
							//instead of indexing past the end of the array.
							buffer.Write(data, i, data.Length - i);
							break;
						}

						bool isCR = IsTheChar(data, charWidth, pos, i, '\r');
						//A CRLF needs one more complete character after the CR
						bool isCRLF = isCR && i + 2 * charWidth <= data.Length && IsTheChar(data, charWidth, pos, i + charWidth, '\n');
						bool isNewLine = isCR || IsTheChar(data, charWidth, pos, i, '\n');

						if (isNewLine && newlineBytes != null)
						{
							buffer.Write(newlineBytes, 0, newlineBytes.Length);
							if (isCRLF) i += charWidth;//the LF of the pair has been consumed as well
						}
						else
						{
							buffer.Write(data, i, charWidth);
						}
					}
					converted = buffer.ToArray();
				}

				//FileMode.Truncate is kept from the original; the using block closes the stream even if the write fails
				using (FileStream outfile = new FileStream(file, FileMode.Truncate, FileAccess.Write))
				{
					outfile.Write(converted, 0, converted.Length);
					//MessageBox.Show(outfile.Length.ToString(), file);
				}
			}

六、主入口:预览(正确显示文本文件)


(这里是文档结束)

你可能感兴趣的:(文本格式与字符编码,文本格式,字符编码,UNICODE,utf,代码页,BOM,乱码)