单词统计

要求：读取一个文本文件，统计其中出现的单词，并打印每个单词出现的次数。只考虑英文单词（由英文字母和数字组成），不考虑中文。

小乓练题：

/**
 * Word-count exercise (learner's version).
 *
 * Reads the whole input file into memory, splits it into maximal runs of
 * alphanumeric characters, and counts how often each distinct run occurs.
 * Every word is echoed as it is found; afterwards each distinct word is
 * printed followed by its count, in order of first appearance.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, 1 if the input file cannot be opened or sized.
 */
int main( int argc, char * argv[])
{
    using namespace std;

    // Binary mode: the byte count reported by tellg() then matches what
    // read() delivers (no newline translation).
    ifstream infile("c:\\a.txt", ios::binary);
    if (!infile)
    {
        cout << " Can not open sourse file! " << endl;
        return 1;
    }

    // Determine the file length; tellg() reports -1 on failure.
    infile.seekg(0, ios::end);
    const streamoff nLength = infile.tellg();
    infile.seekg(0, ios::beg);
    if (nLength < 0)
    {
        cout << " Can not read sourse file! " << endl;
        return 1;
    }

    // Read straight into the string. The original copied from a raw buffer
    // that was not NUL-terminated, so the string constructor ran past the
    // end of the allocation.
    string s(static_cast<string::size_type>(nLength), '\0');
    if (nLength > 0)
    {
        infile.read(&s[0], nLength);
        // Keep only the bytes that were actually read.
        s.resize(static_cast<string::size_type>(infile.gcount()));
    }
    infile.close();

    // Parallel vectors: vecCount[i] is the number of occurrences of vecSubstr[i].
    // NOTE: lookup is a linear scan, which is fine for small inputs.
    vector<string> vecSubstr;
    vector<int> vecCount;

    const string::size_type nSize = s.size();
    string::size_type next = 0;

    while (next < nSize)
    {
        const string::size_type pre = next;

        // Advance over one word. The cast is required: passing a negative
        // char (e.g. a byte of multi-byte text) to isalnum is undefined.
        while (next < nSize && isalnum(static_cast<unsigned char>(s[next])))
        {
            ++next;
        }

        if (pre != next)
        {
            const string temp = s.substr(pre, next - pre);
            cout << temp << endl;

            // Find the word among those already seen.
            vector<string>::size_type iPosition = 0;
            while (iPosition < vecSubstr.size() && vecSubstr[iPosition] != temp)
            {
                ++iPosition;
            }

            if (iPosition == vecSubstr.size())
            {
                vecSubstr.push_back(temp);
                vecCount.push_back(1);
            }
            else
            {
                ++vecCount[iPosition];
            }
        }

        // Step over the delimiter that ended the word (or the current
        // non-word character).
        ++next;
    }

    for (vector<string>::size_type j = 0; j < vecSubstr.size(); ++j)
    {
        cout << vecSubstr[j] << endl << vecCount[j] << endl;
    }

    system( " pause " );

    return 0;
}

C++代码：

/**
 * Word-count exercise (reference version).
 *
 * Reads the whole text file, splits it into maximal runs of alphanumeric
 * characters, counts each distinct word in a std::map, and prints
 * "word <tabs> count" for every word in the map's sorted key order.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, -1 if the input file cannot be opened.
 */
int main( int argc, char * argv[])
{
    // File path (string literals are const, so the pointer must be too).
    const char * szPath = "C:\\text.txt";

    std::ifstream fin(szPath);
    if (!fin)
    {
        std::cout << " Can not open file " << std::endl;
        return -1;
    }

    // Common idiom for reading an entire text file into a string.
    std::string strText = std::string(std::istreambuf_iterator<char>(fin),
                                      std::istreambuf_iterator<char>());

    typedef std::map<std::string, int> CountMap;
    CountMap counter;

    const std::string::size_type nLength = strText.length();
    std::string::size_type nLeft = 0;

    while (nLeft < nLength)
    {
        // Find the first alphanumeric position. The unsigned char cast is
        // required: isalnum on a negative char value is undefined.
        while (nLeft < nLength && !isalnum(static_cast<unsigned char>(strText[nLeft])))
        {
            ++nLeft;
        }
        if (nLeft == nLength)
        {
            break;  // only delimiters remained
        }

        // Find the first non-alphanumeric position after the word start.
        std::string::size_type nRight = nLeft + 1;
        while (nRight < nLength && isalnum(static_cast<unsigned char>(strText[nRight])))
        {
            ++nRight;
        }

        // [nLeft, nRight) is exactly one word with no delimiter inside it.
        // The original skipped the word when nRight == nLength, which lost
        // the last word of any text ending in a letter or digit.
        ++counter[strText.substr(nLeft, nRight - nLeft)];

        nLeft = nRight;
    }

    // Print the results.
    for (CountMap::iterator iter = counter.begin(); counter.end() != iter; ++iter)
    {
        std::cout << iter->first << " \t\t " << iter->second << std::endl;
    }

    system( " pause " );
    return 0;
}

Python 代码：

import re

def count_words(text):
    """Count occurrences of each English word in ``text``.

    A word is a maximal run of ASCII letters and digits, which matches the
    ``isalnum``-based C++ versions. Anything else, including Chinese
    characters and underscores, is treated as a separator.

    Returns a dict mapping each word to its count. Words keep their original
    case, so "The" and "the" are counted separately.
    """
    counts = {}
    # findall never yields empty strings, unlike re.split, which produces ''
    # at the edges when the text starts or ends with a separator.
    for word in re.findall(r'[A-Za-z0-9]+', text):
        counts[word] = counts.get(word, 0) + 1
    return counts


if __name__ == '__main__':
    filepath = r'c:/text.txt'
    with open(filepath) as file:
        text = file.read()
    for key, value in count_words(text).items():
        print(' %s\t\t%s ' % (key, value))

小乓加油！

单词统计

单词统计

你可能感兴趣的:(单词统计)