统计文本中出现的单词及其出现次数
使用STL(std::set)按字典序输出文本中出现的不同单词
/* List every distinct word that appears in word.txt, one per line. */
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
using namespace std;

/**
 * Reads whitespace-separated words from "word.txt", stores each one in a
 * std::set (which drops duplicates and keeps its elements ordered) and
 * prints the resulting words to standard output, one per line.
 *
 * @return EXIT_SUCCESS when the file was processed, EXIT_FAILURE when
 *         "word.txt" could not be opened.
 */
int main()
{
    ifstream in("word.txt");
    if (!in)
    {
        // Report the problem instead of silently printing nothing.
        cerr << "打开文件错误!" << endl;
        return EXIT_FAILURE;
    }

    set<string> DistinctWordSet;
    string str;
    while (in >> str)
    {
        DistinctWordSet.insert(str);
    }

    // Output
    for (set<string>::const_iterator it = DistinctWordSet.begin();
         it != DistinctWordSet.end(); ++it)
    {
        cout << *it << '\n';
    }

    // Flush our own output before the child process writes its prompt.
    cout.flush();
    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
    return EXIT_SUCCESS;
}
STL实现
/* Count how many times each word occurs in word.txt (case-insensitively). */
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
using namespace std;

/**
 * Lower-cases a single character.
 *
 * The value is routed through unsigned char because calling tolower with a
 * negative char value is undefined behaviour.
 */
static char ToLowerChar(char ch)
{
    return static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
}

/**
 * Reads whitespace-separated words from "word.txt", lower-cases them and
 * counts the occurrences of each word in a std::map, then prints
 * "<word> <count>" lines in the map's key order.
 *
 * @return EXIT_SUCCESS when the file was processed, EXIT_FAILURE when
 *         "word.txt" could not be opened.
 */
int main()
{
    ifstream in("word.txt"); // open the input file
    if (in.fail())
    {
        cerr << "打开文件错误!" << endl;
        return EXIT_FAILURE; // a failed open must not be reported as success
    }

    map<string, int> WordCountMap;
    string str;
    while (in >> str)
    {
        // Fold upper case to lower case so "The" and "the" share one entry.
        transform(str.begin(), str.end(), str.begin(), ToLowerChar);
        ++WordCountMap[str];
    }
    in.close();

    // Output
    for (map<string, int>::const_iterator it = WordCountMap.begin();
         it != WordCountMap.end(); ++it)
    {
        cout << it->first << " " << it->second << '\n';
    }

    // Flush our own output before the child process writes its prompt.
    cout.flush();
    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
    return EXIT_SUCCESS;
}
// Section heading carried over from the original text: "Hash实现"
// (the hash-table implementation is the next listing).
/* Count how many times each word occurs in word.txt, using a hand-written
   hash table with separate chaining. */
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

const int NHASH = 29989; // number of buckets in the table
const int MULT = 31;     // multiplier of the polynomial string hash

/* One entry of a bucket's singly linked chain: a word and its count. */
class StrNode
{
public:
    string word;
    unsigned int count;
    StrNode* next;

public:
    // Initializers are listed in declaration order (word, count, next).
    StrNode(const string& str) : word(str), count(1), next(NULL) {}
};

/* Hash table mapping each word to the number of times it was inserted. */
class CountStr
{
public:
    CountStr();
    ~CountStr();

public:
    unsigned int HashIndex(const string& str) const;
    void InsertWord(const string& str);
    void InitStr(const string& FileName);
    void Print() const;

private:
    // The table owns raw node pointers, so copying would lead to double
    // deletion; copy operations are declared private and left undefined.
    CountStr(const CountStr&);
    CountStr& operator=(const CountStr&);

private:
    StrNode* bin[NHASH];
};

/* Starts with every bucket empty. */
CountStr::CountStr()
{
    for (int i = 0; i < NHASH; ++i)
    {
        bin[i] = NULL;
    }
}

/* Releases every node of every chain. */
CountStr::~CountStr()
{
    for (int i = 0; i < NHASH; ++i)
    {
        StrNode* node = bin[i];
        while (node != NULL)
        {
            StrNode* following = node->next;
            delete node;
            node = following;
        }
        bin[i] = NULL;
    }
}

/**
 * Computes the bucket index of a string.
 *
 * Example: the hash of "abc" is ((97 * 31 + 98) * 31 + 99) % NHASH.
 * Characters are read as unsigned char so the result does not depend on
 * whether plain char is signed. An empty string hashes to bucket 0.
 *
 * @param str the word to hash
 * @return an index in [0, NHASH)
 */
unsigned int CountStr::HashIndex(const string& str) const
{
    unsigned int index = 0;
    for (string::size_type i = 0; i < str.size(); ++i)
    {
        index = MULT * index + static_cast<unsigned char>(str[i]);
    }
    return index % NHASH;
}

/**
 * Records one occurrence of a word: increments the count of an existing
 * entry, or adds a new entry with count 1.
 *
 * @param str the word to record
 */
void CountStr::InsertWord(const string& str)
{
    const unsigned int index = HashIndex(str);
    for (StrNode* node = bin[index]; node != NULL; node = node->next)
    {
        if (node->word == str)
        {
            ++node->count;
            return;
        }
    }

    // Not present yet: insert the new node at the head of the chain.
    StrNode* fresh = new StrNode(str);
    fresh->next = bin[index];
    bin[index] = fresh;
}

/**
 * Reads whitespace-separated words from a file, lower-cases them and
 * inserts each one into the table.
 *
 * @param fileName path of the text file to read; if it cannot be opened an
 *                 error message is written to cerr and nothing is inserted
 */
void CountStr::InitStr(const string& fileName)
{
    ifstream in(fileName.c_str());
    if (!in)
    {
        cerr << "打开文件错误!" << endl;
        return;
    }

    string str;
    while (in >> str)
    {
        // Fold upper case to lower case; tolower requires a value
        // representable as unsigned char, hence the cast.
        for (string::size_type i = 0; i < str.size(); ++i)
        {
            str[i] = static_cast<char>(
                std::tolower(static_cast<unsigned char>(str[i])));
        }
        InsertWord(str);
    }
}

/* Prints "<word> <count>" for every entry, bucket by bucket. */
void CountStr::Print() const
{
    for (int i = 0; i < NHASH; ++i)
    {
        for (const StrNode* node = bin[i]; node != NULL; node = node->next)
        {
            cout << node->word << " " << node->count << '\n';
        }
    }
}

/**
 * Counts the words of "word.txt" and prints the result.
 *
 * @return EXIT_SUCCESS
 */
int main()
{
    CountStr countStr;
    countStr.InitStr("word.txt");
    countStr.Print();

    // Flush our own output before the child process writes its prompt.
    cout.flush();
    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
    return EXIT_SUCCESS;
}
trie树实现
/* Count how many times each word occurs in word.txt, using a trie over the
   lower-case letters 'a'..'z'. */
#include <algorithm>
#include <assert.h>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

const int MaxBranchNum = 26; // one branch per letter 'a'..'z'

/* Trie node. */
class TrieNode
{
public:
    char* word; // the complete word ending at this node, or NULL if none
    int count;  // how many times that word was inserted
    TrieNode* nextBranch[MaxBranchNum];

public:
    TrieNode() : word(NULL), count(0)
    {
        for (int i = 0; i < MaxBranchNum; ++i)
        {
            nextBranch[i] = NULL;
        }
    }
};

/* Trie that stores words together with their occurrence counts. */
class Trie
{
public:
    Trie();
    ~Trie();
    void Insert(const char* str);
    void Print();

private:
    TrieNode* pRoot;

private:
    // The trie owns its nodes through raw pointers, so copying would lead
    // to double deletion; copy operations are private and left undefined.
    Trie(const Trie&);
    Trie& operator=(const Trie&);

    void Destroy(TrieNode* pNode);
    void Print(TrieNode* pNode);
};

Trie::Trie()
{
    pRoot = new TrieNode();
}

Trie::~Trie()
{
    Destroy(pRoot);
    pRoot = NULL;
}

/**
 * Records one occurrence of a word.
 *
 * Only words made up entirely of the letters 'a'..'z' are stored; any other
 * word is ignored. (To distinguish upper and lower case, the branch set
 * would have to be extended.)
 *
 * @param str NUL-terminated word; must not be NULL
 */
void Trie::Insert(const char* str)
{
    assert(NULL != str);

    // Validate the whole word first so that a rejected word does not leave
    // freshly created, unused nodes behind. The upper bound is exclusive:
    // index MaxBranchNum itself would be one past the end of nextBranch.
    for (const char* p = str; *p != '\0'; ++p)
    {
        const int index = *p - 'a';
        if (index < 0 || index >= MaxBranchNum)
        {
            return; // do not insert
        }
    }

    // Walk down the trie, creating missing nodes along the way.
    TrieNode* pLoc = pRoot;
    for (const char* p = str; *p != '\0'; ++p)
    {
        TrieNode*& branch = pLoc->nextBranch[*p - 'a'];
        if (NULL == branch)
        {
            branch = new TrieNode();
        }
        pLoc = branch;
    }

    // First occurrence: remember the word text at its terminal node.
    if (NULL == pLoc->word)
    {
        char* copy = new char[strlen(str) + 1];
        strcpy(copy, str);
        pLoc->word = copy;
    }
    ++pLoc->count;
}

/* Prints "<word> <count>" for every stored word. */
void Trie::Print()
{
    Print(pRoot);
}

/* Prints the words of the subtree rooted at pNode: the node's own word
   first, then its branches in order 'a'..'z'. */
void Trie::Print(TrieNode* pNode)
{
    if (NULL == pNode)
    {
        return;
    }

    if (NULL != pNode->word)
    {
        cout << pNode->word << " " << pNode->count << '\n';
    }

    for (int i = 0; i < MaxBranchNum; ++i)
    {
        Print(pNode->nextBranch[i]);
    }
}

/* Frees the subtree rooted at pNode: children first, then the node's word
   buffer, then the node itself. */
void Trie::Destroy(TrieNode* pNode)
{
    if (NULL == pNode)
    {
        return;
    }

    for (int i = 0; i < MaxBranchNum; ++i)
    {
        Destroy(pNode->nextBranch[i]);
    }

    delete[] pNode->word; // deleting a null pointer is a no-op
    delete pNode;
}

/**
 * Reads whitespace-separated words from "word.txt", lower-cases them,
 * inserts them into a trie and prints every stored word with its count.
 *
 * @return EXIT_SUCCESS when the file was processed, EXIT_FAILURE when
 *         "word.txt" could not be opened.
 */
int main(int /*argc*/, char* /*argv*/[])
{
    ifstream in("word.txt");
    if (!in)
    {
        cerr << "打开文件错误!" << endl;
        return EXIT_FAILURE;
    }

    Trie t;
    string str;

    // Feed every word into the trie.
    while (in >> str)
    {
        // Fold upper case to lower case; tolower requires a value
        // representable as unsigned char, hence the cast.
        for (string::size_type i = 0; i < str.size(); ++i)
        {
            str[i] = static_cast<char>(
                std::tolower(static_cast<unsigned char>(str[i])));
        }
        t.Insert(str.c_str());
    }

    // Output
    t.Print();

    // Flush our own output before the child process writes its prompt.
    cout.flush();
    // Keeps the console window open; "PAUSE" is a Windows shell command.
    system("PAUSE");
    return EXIT_SUCCESS;
}