过滤 外文unicode文本中字符的代码

Author: godspirit

 

This source code filters unexpected characters out of a Unicode TXT document and generates a formatted TXT file. For example, Chinese characters are removed from the original file.

 

If you want to look up the Unicode value of a certain character, I advise you to run the "charmap" command, which is useful in the Windows command line and in Linux X Windows.

 

 

 

 

#include <stdio.h>
#include <stdlib.h>

/* ASCII space, compared against the 16-bit code units read from the file. */
#define SPACE (0x20)

/*
 * Target-language selection. UnicodeFilter() tests these names with an
 * #if/#elif chain (PORTUGUESE, ITALIAN, SPANISH, FRANCH, in that order) to
 * decide which accented Latin-1 letters are accepted in addition to A-Z/a-z.
 * Enable one of them; "FRANCH" is the spelling the #elif actually tests.
 */
#define PORTUGUESE (1)
//#define ITALIAN  (1)
//#define SPANISH  (1)
//#define FRANCH  (1)

/*
 * History of the most recently accepted code units.
 * LastShort[0] is the newest entry, LastShort[2] the oldest.
 */
static unsigned short LastShort[3] = {0};
unsigned short IsComma(unsigned short data);
static unsigned short ThrowFlag = 0;

/*
 * Push `current` onto the front of the LastShort history, discarding the
 * oldest entry.
 *
 * The shift starts at the last valid index (element count - 1); starting at
 * the element count itself would write one slot past the end of the array.
 */
void UpdateLastShort(unsigned short current)
{
    int i;
    const int last = (int)(sizeof(LastShort) / sizeof(LastShort[0])) - 1;

    for (i = last; i > 0; i--)
    {
        LastShort[i] = LastShort[i - 1];
    }
    LastShort[0] = current;
}

/*
 * Decide whether `current` may follow the code units already accepted.
 *
 * Returns 0 (rejected, history untouched) when:
 *   - both `current` and the previous unit satisfy IsComma();
 *   - `current` is CR or LF and the unit two positions back is CR or LF;
 *   - `current` repeats the previous unit and is one of : " ' - ;
 *   - `current` is a space following a space, CR or LF.
 * Otherwise records `current` via UpdateLastShort() and returns 1.
 */
unsigned short IsUnicodeOK(unsigned short current)
{
    const unsigned short prev = LastShort[0];
    const unsigned short beforePrev = LastShort[1];

    if (IsComma(current) && IsComma(prev))
    {
        return 0;
    }

    switch (current)
    {
    case 0x0d:
    case 0x0a:
        if (beforePrev == 0x0d || beforePrev == 0x0a)
        {
            return 0;
        }
        break;
    case 0x3a: /* : */
    case 0x22: /* " */
    case 0x27: /* ' */
    case 0x2d: /* - */
        if (prev == current)
        {
            return 0;
        }
        break;
    case SPACE:
        if (prev == SPACE || prev == 0x0d || prev == 0x0a)
        {
            return 0;
        }
        break;
    default:
        break;
    }

    UpdateLastShort(current);
    return 1;
}


/*
 * Return 1 when `data` is one of ? ! ; , . or when it is the second half of
 * a CR/LF (or LF/CR) pair, judged against the newest entry of LastShort.
 * Return 0 otherwise.
 */
unsigned short IsComma(unsigned short data)
{
    switch (data)
    {
    case 0x003f: /* ? */
    case 0x0021: /* ! */
    case 0x003b: /* ; */
    case 0x002c: /* , */
    case 0x002e: /* . */
        return 1;
    case 0x0d:
        return (unsigned short)(LastShort[0] == 0x0a);
    case 0x0a:
        return (unsigned short)(LastShort[0] == 0x0d);
    default:
        return 0;
    }
}

/*
 * Return 1 when `data` is one of ? ! . or when it is the second half of a
 * CR/LF (or LF/CR) pair, judged against the newest entry of LastShort.
 * Return 0 otherwise.
 */
unsigned short IsStop(unsigned short data)
{
    const unsigned short prev = LastShort[0];

    if (data == 0x003f || data == 0x0021 || data == 0x002e) /* ? ! . */
    {
        return 1;
    }
    if (data == 0x0d)
    {
        return (unsigned short)(prev == 0x0a);
    }
    if (data == 0x0a)
    {
        return (unsigned short)(prev == 0x0d);
    }
    return 0;
}

/*
 * Return 1 when `data` is treated as a word separator: CR, LF, space,
 * an ASCII digit, or one of ! " ' ( ) , - . : ; ?
 * Return 0 for every other code unit.
 */
unsigned short IsDivision(unsigned short data)
{
    if (data >= 0x0030 && data <= 0x0039) /* digits 0-9 */
    {
        return 1;
    }

    switch (data)
    {
    case 0x000d: /* \r */
    case 0x000a: /* \n */
    case 0x0020: /* space */
    case 0x0021: /* ! */
    case 0x0022: /* " */
    case 0x0027: /* ' */
    case 0x0028: /* ( */
    case 0x0029: /* ) */
    case 0x002c: /* , */
    case 0x002d: /* - */
    case 0x002e: /* . */
    case 0x003a: /* : */
    case 0x003b: /* ; */
    case 0x003f: /* ? */
        return 1;
    default:
        return 0;
    }
}

/*
 * Buffer the code units of the current word and write the word out once a
 * separator (see IsDivision) arrives.
 *
 * current   - code unit to process.
 * fp        - stream that receives the buffered word when it is flushed.
 * throwFlag - 0: `current` is validated with IsUnicodeOK() and appended to
 *                the pending word.
 *             non-zero: `current` is NOT buffered; it only acts as a
 *                separator, so whatever word is pending is written to `fp`.
 *                The caller uses this with the "throw" file so that a word
 *                containing an unexpected character is kicked out as a whole.
 *
 * Returns 1 when IsUnicodeOK() rejects `current` (nothing is buffered),
 * 0 otherwise.
 *
 * The pending word lives in a fixed 128-unit static buffer. If a word grows
 * beyond that without a separator, the full buffer is flushed to `fp` before
 * the next unit is stored, instead of writing past the end of the array.
 */
unsigned short Put2File(unsigned short current, FILE *fp, unsigned short throwFlag)
{
    static unsigned short tempbuf[128] = {0};
    static unsigned short count = 0;
    const unsigned short capacity =
        (unsigned short)(sizeof(tempbuf) / sizeof(tempbuf[0]));

    if (throwFlag == 0)
    {
        if (IsUnicodeOK(current) == 0)
        {
            return 1;
        }
        if (count >= capacity)
        {
            /* Over-long word: emit what we have rather than overflow. */
            fwrite(tempbuf, sizeof(unsigned short), count, fp);
            count = 0;
        }
        tempbuf[count] = current;
        count++;
    }

    if (IsDivision(current) && (count != 0))
    {
        fwrite(tempbuf, sizeof(unsigned short), count, fp);
        count = 0;
    }
    return 0;
}


/*
 * Map a full-width or typographic punctuation code unit to its ASCII
 * counterpart. Returns 0 when `c` has no replacement.
 */
static unsigned short MapToAsciiPunct(unsigned short c)
{
    switch (c)
    {
    case 0xff1f:                 /* full-width ? */
        return 0x003f;
    case 0xff0c:                 /* full-width , (U+FF0C; 0xff1c would be '<') */
        return 0x002c;
    case 0xff0e:                 /* full-width . */
        return 0x002e;
    case 0xff01:                 /* full-width ! */
        return 0x0021;
    case 0xff0d:                 /* full-width - */
    case 0x2013:
    case 0x2014:
    case 0x2015:
    case 0x2016:
        return 0x002d;
    case 0xff1a:                 /* full-width : */
        return 0x003a;
    case 0xff1b:                 /* full-width ; */
        return 0x003b;
    case 0xff07:                 /* full-width ' and curly single quotes */
    case 0x2018:
    case 0x2019:
        return 0x0027;
    case 0xff02:                 /* full-width " and curly double quotes */
    case 0x201c:
    case 0x201d:
        return 0x0022;
    default:
        return 0;
    }
}

/*
 * Filter the 16-bit Unicode text file `FileName`.
 *
 * The file is read as a stream of unsigned short code units. Processing only
 * happens when the first unit read equals 0xfeff; otherwise the input is
 * closed and 0 is returned without creating any output.
 *
 * Accepted units (A-Z, a-z, the accented letters of the language selected at
 * compile time, and separators per IsDivision) go to "new_<FileName>" through
 * Put2File(). Full-width / typographic punctuation is first replaced by its
 * ASCII form. Everything else is appended to "throw_<FileName>", together
 * with the word that was pending in Put2File().
 *
 * Returns 0 on success (or when the header does not match), 1 when a file
 * cannot be opened or a derived file name does not fit its buffer.
 */
unsigned short UnicodeFilter(char * FileName)
{
    FILE *fp = NULL;
    FILE *fpNew = NULL;
    FILE *fpThrow = NULL;
    unsigned long ThrowCnt = 0;
    unsigned short UnicodeHeader = 0;
    unsigned short CurRusCharUni = 0; /* code unit currently read from the input */
    unsigned short Mapped = 0;
    char newFileName[128] = {0};
    char ThrowFileName[128] = {0};
    int n;

    /* Build the output names with bounds checking; a long input name must
     * not overflow the fixed-size buffers. */
    n = snprintf(newFileName, sizeof(newFileName), "new_%s", FileName);
    if (n < 0 || (size_t)n >= sizeof(newFileName))
    {
        printf("File Name Too Long~!\n");
        return 1;
    }
    n = snprintf(ThrowFileName, sizeof(ThrowFileName), "throw_%s", FileName);
    if (n < 0 || (size_t)n >= sizeof(ThrowFileName))
    {
        printf("File Name Too Long~!\n");
        return 1;
    }

    fp = fopen(FileName, "rb");
    if (fp == NULL)
    {
        printf("File Open Failed~!\n");
        return 1;
    }

    /* An empty/short file leaves the header at 0, which fails the check below. */
    if (fread(&UnicodeHeader, sizeof(unsigned short), 1, fp) != 1)
    {
        UnicodeHeader = 0;
    }

    /* Only process files that satisfy the Unicode header requirement. */
    if (UnicodeHeader == 0xfeff)
    {
        fpNew = fopen(newFileName, "ab");
        fpThrow = fopen(ThrowFileName, "ab");
        if ((fpNew == NULL) || (fpThrow == NULL))
        {
            printf("newFileName Open Failed~!\n");
            /* Release whatever was opened before bailing out. */
            if (fpNew != NULL)
            {
                fclose(fpNew);
            }
            if (fpThrow != NULL)
            {
                fclose(fpThrow);
            }
            fclose(fp);
            return 1;
        }

        while (fread(&CurRusCharUni, sizeof(unsigned short), 1, fp) == 1)
        {
            if    ((CurRusCharUni >= 0x0041 && CurRusCharUni <= 0x005A) //A-->Z
                || (CurRusCharUni >= 0x0061 && CurRusCharUni <= 0x007A) //a-->z
#if defined(PORTUGUESE)
                || (CurRusCharUni >= 0x00C0 && CurRusCharUni <= 0x00C3) //Portuguese specific
                || (CurRusCharUni >= 0x00E0 && CurRusCharUni <= 0x00E3) //Portuguese specific
                || (CurRusCharUni == 0x00C7) || (CurRusCharUni == 0x00E7) //Portuguese specific
                || (CurRusCharUni == 0x00C9) || (CurRusCharUni == 0x00E9) //Portuguese specific
                || (CurRusCharUni == 0x00CA) || (CurRusCharUni == 0x00EA) //Portuguese specific
                || (CurRusCharUni == 0x00CD) || (CurRusCharUni == 0x00ED) //Portuguese specific
                || (CurRusCharUni >= 0x00D3  && CurRusCharUni <= 0x00D5) //Portuguese specific
                || (CurRusCharUni >= 0x00F3  && CurRusCharUni <= 0x00F5) //Portuguese specific
                || (CurRusCharUni == 0x00DA || CurRusCharUni == 0x00FA) //Portuguese specific
                || (CurRusCharUni == 0x00DC || CurRusCharUni == 0x00FC) //Portuguese specific
#elif defined(ITALIAN)
                || (CurRusCharUni == 0x00C8) || (CurRusCharUni == 0x00E8) //Italian
                || (CurRusCharUni == 0x00C9) || (CurRusCharUni == 0x00E9) //Italian
                || (CurRusCharUni == 0x00D2) || (CurRusCharUni == 0x00F2) //Italian
                || (CurRusCharUni == 0x00D3) || (CurRusCharUni == 0x00F3) //Italian
#elif defined(SPANISH)
                || (CurRusCharUni == 0x00C1) || (CurRusCharUni == 0x00E1)
                || (CurRusCharUni == 0x00C9) || (CurRusCharUni == 0x00E9)
                || (CurRusCharUni == 0x00CD) || (CurRusCharUni == 0x00ED)
                || (CurRusCharUni == 0x00D1) || (CurRusCharUni == 0x00F1)
                || (CurRusCharUni == 0x00D3) || (CurRusCharUni == 0x00F3)
                || (CurRusCharUni == 0x00DA) || (CurRusCharUni == 0x00FA)
                || (CurRusCharUni == 0x00DC) || (CurRusCharUni == 0x00FC)
#elif defined(FRANCH)
                || (CurRusCharUni == 0x00C0) || (CurRusCharUni == 0x00E0)
                || (CurRusCharUni == 0x00C2) || (CurRusCharUni == 0x00E2)
                || ((CurRusCharUni >= 0x00C7) && (CurRusCharUni <= 0x00CB))
                || ((CurRusCharUni >= 0x00E7) && (CurRusCharUni <= 0x00EB))
                || (CurRusCharUni == 0x00CE) || (CurRusCharUni == 0x00EE)
                || (CurRusCharUni == 0x00CF) || (CurRusCharUni == 0x00EF)
                || (CurRusCharUni == 0x00D4) || (CurRusCharUni == 0x00F4)
                || (CurRusCharUni == 0x00DB) || (CurRusCharUni == 0x00FB)
                || (CurRusCharUni == 0x00D9) || (CurRusCharUni == 0x00F9)
                || (CurRusCharUni == 0x00DC) || (CurRusCharUni == 0x00FC)
                || (CurRusCharUni == 0x0178) || (CurRusCharUni == 0x00FF)
#endif
                || IsDivision(CurRusCharUni)) // valid character
            {
                /* Write to the new file. */
                Put2File(CurRusCharUni, fpNew, 0);
                continue;
            }

            printf("A unexpected char -[0x%x]- detected~\n", (unsigned int)CurRusCharUni);

            /* Replace known punctuation variants with their ASCII form. */
            Mapped = MapToAsciiPunct(CurRusCharUni);
            if (Mapped != 0)
            {
                Put2File(Mapped, fpNew, 0);
                continue;
            }

            /* No replacement: flush the pending word into the throw file
             * (the space acts purely as a separator), then record the
             * offending unit there as well. */
            Put2File(0x20, fpThrow, 1);
            fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpThrow);

            ThrowCnt++;
            if (ThrowCnt % 50 == 49)
            {
                printf("%lu characters have been thrown out!\r\n", ThrowCnt);
            }
        }
        fclose(fpThrow);
        fclose(fpNew);
    }

    fclose(fp);

    return 0;
}

/*
 * Switch FilePartition() to the next numbered partition file.
 *
 * Expands in a scope that provides the locals FileCnt, WordCnt,
 * partitionFileName (a char array), newFileName, fpPartition, fpFormat and
 * newline. It bumps the file counter, resets the word counter, closes the
 * current partition file, opens "<FileCnt>_<newFileName>" for appending and
 * writes the blank-line marker to the format file and to the new partition.
 *
 * If the new partition file cannot be opened, fpPartition is left NULL and
 * nothing is written to it; the caller must check fpPartition afterwards.
 */
#define UPDATE_PARTTIION_FILE \
do{  \
 FileCnt ++; \
 WordCnt = 0; \
 snprintf(partitionFileName, sizeof(partitionFileName), "%d_%s",FileCnt, newFileName); \
 fclose(fpPartition); \
 fpPartition = fopen(partitionFileName,"ab"); \
 fwrite(newline, sizeof(unsigned short), 4, fpFormat); \
 if (fpPartition != NULL) \
 { \
  fwrite(newline, sizeof(unsigned short), 4, fpPartition); \
 } \
}while(0)

 

unsigned short FilePartition(char * FileName)
{
 FILE *fpNew = NULL;
 FILE *fpPartition = NULL;
 FILE *fpFormat = NULL;
 unsigned long ThrowCnt = 0;
 unsigned short UnicodeHeader = 0;
 unsigned short CurRusCharUni = 0;//个岸s鲰卓的nicode?
 int FileCnt = 1, WordCnt = 0;
 unsigned short newline[4] = {0x0d,0x0a,0x0d,0x0a};
 char newFileName[128] = {0};
 char partitionFileName[128] = {0};
 char FormatFileName[128] = {0};
 
 sprintf(newFileName,"new_%s",FileName);
 sprintf(FormatFileName,"format_%s",FileName);
 sprintf(partitionFileName, "%d_%s",FileCnt,newFileName);

     
 fpPartition = fopen(partitionFileName,"ab");
 fpFormat = fopen(FormatFileName,"ab");
 fpNew = fopen(newFileName,"rb");
    if ((fpNew == NULL)||(fpPartition == NULL)||(fpFormat == NULL))
    {
            printf("FilePartition File Open Failed~!/n");
            return 1;
    }

 
    //fread(&UnicodeHeader,sizeof(unsigned short),1,fp);
    //printf("Get a char unicode= %x/n",UnicodeHeader); 
 
 while((fread(&CurRusCharUni,sizeof(unsigned short),1,fpNew ))&&(fpPartition != NULL))
 {
  fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpPartition);
  fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpFormat);
  
  if(WordCnt < 40)
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
  }
  else if((WordCnt >= 40)&&(WordCnt < 50))
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
   else if(IsStop(CurRusCharUni))
   {
    FileCnt ++;
    UPDATE_PARTTIION_FILE;
   }
  }
  else if((WordCnt >= 50)&&(WordCnt < 65))
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
   else if(IsComma(CurRusCharUni))
   {
    UPDATE_PARTTIION_FILE;
   }
  }
  else
  {
   if(CurRusCharUni == SPACE)
   {
    UPDATE_PARTTIION_FILE;
   }
  }
 }
 fclose(fpNew); 
        //if satisfy the header requirement of the unicode file
 return 0;
}

/*
 * Entry point: argv[1] names the Unicode TXT file to process.
 *
 * Steps:
 *   1. remove the outputs of a previous run (the "<name>_Output" directory
 *      and the new_/throw_/format_ files);
 *   2. create "<name>_Output";
 *   3. run UnicodeFilter() and FilePartition() on the file;
 *   4. move the generated "*_new_*" partition files into "<name>_Output".
 *
 * Returns 0 on success and 1 on a usage error or when a stage fails.
 *
 * NOTE(review): argv[1] is pasted unquoted into shell commands run through
 * system(), so names with spaces or shell metacharacters will misbehave.
 */
int main(int argc, char * argv[])
{
    unsigned short RltOfFilter = 0;
    /* Large enough for the longest command, which embeds argv[1] four times. */
    char buf[1024] = {0};
    int n;

    if (argc < 2)
    {
        printf("Please input the name of TXT file:\n");
        return 1;
    }

    /* Clean up the directory that is actually created below
     * ("<name>_Output"), not a fixed "Output". */
    n = snprintf(buf, sizeof(buf), "rm -rf %s_Output new_%s throw_%s format_%s",
                 argv[1], argv[1], argv[1], argv[1]);
    if (n < 0 || (size_t)n >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    system(buf);

    n = snprintf(buf, sizeof(buf), "mkdir %s_Output", argv[1]);
    if (n < 0 || (size_t)n >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    system(buf);

    RltOfFilter = UnicodeFilter(argv[1]);
    if (RltOfFilter != 0)
    {
        printf("[ERROR]  Format TXT File failed\n");
        return 1;
    }

    RltOfFilter = FilePartition(argv[1]);
    if (RltOfFilter != 0)
    {
        printf("[ERROR]  Partition TXT File failed\n");
        return 1;
    }

    n = snprintf(buf, sizeof(buf), "mv *_new_* ./%s_Output", argv[1]);
    if (n < 0 || (size_t)n >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    system(buf);
    return 0;
}

 

你可能感兴趣的:(File,null,character,FP,output,newline)