Click here to Skip to main content
15,911,531 members
Please Sign up or sign in to vote.
1.10/5 (5 votes)
See more:
C++
#include "stdafx.h"
#include <conio.h>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <Windows.h>
#include <sstream>
#include <algorithm>
#include "hash_map"
#include <stdio.h>
using namespace std;
string b1 = "<BODY>", b2 = "</BODY>";
#define  FINPUT "input.txt"
#define  FOUTPUT "output.txt"
#define  STOPWORDS  "stopword.txt"
#define  START_DOC "<BODY>"
#define  END_DOC "</BODY>"
#define  START_DOC_LEN b1.length()
#define  END_DOC_LEN b2.length()
class CountOfWrd<code></code>
{
public:
    int DocNum;
    int Repeat;
    vector<int> Positions;
};
vector<string> vec_stop;
vector<string> WordsForSort;
hash_map <string, vector<CountOfWrd>> hmap;
hash_map <string, vector<CountOfWrd>> :: iterator hmap_AcIter;
typedef pair <string, vector<CountOfWrd>> Word_Pair;
void syntax(string &word);
inline string stem(string word);
inline void caps(string &word);
bool is_stop(string& word);
int _tmain(int argc, _TCHAR* argv[])
{
    ifstream file_of_stopwords(STOPWORDS);
    string str_stop;
    while( file_of_stopwords >> str_stop )
        vec_stop.push_back(str_stop);
    CountOfWrd TempCw;
    ifstream in(FINPUT);
    string word;
    int Doc = 0;
    int Pos=0;
    int startPos;
    vector<CountOfWrd> TmpPushPos;
    while(in >> word)
        if(word.find(START_DOC) != string::npos){
            if(word.length() > START_DOC_LEN){
                startPos = word.find(START_DOC);
                word = word.substr (startPos + START_DOC_LEN );
                Pos = 0;
                do{
                Pos ++;
                if(is_stop(word) == true)
                    continue;
                syntax(word);
                if(word.length() > 2)
                stem(word);
                caps(word);
                if(is_stop(word) == true)
                    continue;
                hmap_AcIter = hmap.find(word);
                if(hmap_AcIter != hmap.end())
                {if(hmap_AcIter->second[hmap_AcIter->second.size()-1].DocNum == Doc)
                    {   (hmap_AcIter)->second[hmap_AcIter->second.size()-1].Repeat++;
                        (hmap_AcIter->second[hmap_AcIter->second.size()-1].Positions).push_back(Pos);}
                    else{
                        TempCw.Repeat = 1;
                        TempCw.DocNum = Doc;
                        TempCw.Positions.clear();
                        TempCw.Positions.push_back(Pos);
                        ((hmap_AcIter)->second).push_back(TempCw);}}
                else
                {
                    TempCw.DocNum = Doc;
                    TempCw.Positions.clear();
                    TempCw.Positions.push_back(Pos);
                    TempCw.Repeat = 1;
                    TmpPushPos.clear();
                    TmpPushPos.push_back(TempCw);
                    hmap.insert(Word_Pair(word,TmpPushPos));
                    WordsForSort.push_back(word);
                }

            }while(in >> word && word.find(END_DOC) == string::npos);
        }
    make_heap(WordsForSort.begin(), WordsForSort.end());
    sort_heap(WordsForSort.begin(), WordsForSort.end());
    int start_of_index=0;
    ofstream out;
    out.open(FOUTPUT);
    for(unsigned int i = start_of_index; i < WordsForSort.size(); i++){
        hmap_AcIter = hmap.find(WordsForSort[i]);
        out << hmap_AcIter->first << "\t\t";
        for(unsigned int j = 0; j < hmap_AcIter->second.size(); j++){
            out << "[" <<hmap_AcIter->second[j].DocNum << "," << hmap_AcIter->second[j].Repeat << "(";
                for(unsigned int k = 0; k < hmap_AcIter->second[j].Positions.size(); k++){
                out << hmap_AcIter->second[j].Positions[k];
                        if(k !=hmap_AcIter->second[j].Positions.size()-1)
                            out << ",";}
                    out << ")" << "]" << "\t";}
        out<< "\n";
        }
    out.close();
    _getch();
    return 0;}
inline string stem(string word){
    if((word[word.length()-3] == 'i' || word[word.length()-3] == 'I') && (word[word.length()-2] == 'n' || word[word.length()-2] == 'N') && (word[word.length()-1] == 'g' || word[word.length()-1] == 'G'))
            word.erase(word.length()-3, 3);
    return word;}
inline void caps(string &word){
for(unsigned int i=0; i<word.size(); i++)
        if( isupper(word[i]))
        word[i] = tolower( word[i] );}
bool is_stop(string& word)
{for(unsigned int i=0; i<vec_stop.size(); i++)
        if( word == vec_stop[i] )
            return true;
    return false;}
void syntax(string &word){
for(unsigned int i=0; i<word.size(); i++){
        if( word[i] == '.'||word[i] == '#' ||word[i] == '&'|| word[i] == ',' || word[i] == ';'
            || word[i] == ')' || word[i] == '(' || word[i] == ':' || word[i] == '-' || word[i] == '"' ||  word[i] == '/' || word[i] == '?'
            || word[i] == '\'' || word[i] == '\"' || (word[i] >= 48 && word[i] <= 57))
            {word.erase(i, 1);
            i--;}
            }
    return;}
Posted
Updated 30-Dec-13 12:03pm
v3
Comments
mahla.r 30-Dec-13 16:28pm    
here is Inverted Index code , this Index in 6 minute ,i want get in 2 minute or less
Ron Beyer 30-Dec-13 16:59pm    
Wow that's a mess...
Sergey Alexandrovich Kryukov 30-Dec-13 19:19pm    
This looks so messy that even the best throughput (even if it was possible with code like that) would not present enough reward. :-)
Anyway, the question does not make enough sense. You would need to explain what do you want to achieve, to start with...
—SA
Dimitris Vasiliadis 30-Dec-13 20:18pm    
You can't get better speed unless the code compiles first without errors.
[no name] 30-Dec-13 20:26pm    
Speeding up an operation is not waving a magic wand. You must determine where the bottlenecks are. This is only partly possible by looking at code. Profiling is the next step. Is your 6 minutes measured with a huge input file? If so you may already be close to optimum for your algorithm and a redesign or faster hardware may be the only way to gain speed.

1 solution

C++
inline string stem(string word){
    if((word[word.length()-3] == 'i' || word[word.length()-3] == 'I') && (word[word.length()-2] == 'n' || word[word.length()-2] == 'N') && (word[word.length()-1] == 'g' || word[word.length()-1] == 'G'))
            word.erase(word.length()-3, 3);
    return word;}

Why are you using code like this when the run time libraries already provide optimised functions for comparing strings without regard to cas? Read the documentation, either for the string class, or the strXXX functions. Similarly the isXXX functions will determine if a string contains control characters, is all digits etc.
 
Share this answer
 
Comments
mahla.r 31-Dec-13 10:53am    
for hash map part , what can i do to get faster speed ?
Richard MacCutchan 31-Dec-13 11:49am    
It depends what you are trying to achieve. Maybe use a different collection type, or even a simple array.
mahla.r 31-Dec-13 15:09pm    
for build full text search engine ,how i change my code for better performance?
input file is reuter it's about 50Mb
and output is text file that index and sort all of the word Except stop word
it last about 6 minute
Richard MacCutchan 1-Jan-14 7:17am    
You code is very difficult to understand. I would suggest adding some time sampling in there to see which operations are the slowest and then work on redesigning those parts one at a time to see if that improves things. BTW what is "reuter"?
mahla.r 2-Jan-14 17:19pm    
reuter is a corpus

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900