LOGO

Multifast 2.0.0

A Total Solution for
Mass String Search & Substitution

Download

Example 3

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*
 * AhoCorasickPlus.h: This is the header file for a sample
 * C++ wrapper for Aho-Corasick C library
 *
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com>

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef AHOCORASICKPPW_H_
#define AHOCORASICKPPW_H_

#include <string>
#include <queue>

// Forward declarations
struct ac_trie;
struct ac_text;


class AhoCorasickPlus
{
public:
    
    enum EnumReturnStatus
    {
        RETURNSTATUS_SUCCESS = 0,       // No error occurred
        RETURNSTATUS_DUPLICATE_PATTERN, // Duplicate patterns
        RETURNSTATUS_LONG_PATTERN,      // Long pattern
        RETURNSTATUS_ZERO_PATTERN,      // Empty pattern (zero length)
        RETURNSTATUS_AUTOMATA_CLOSED,   // Automata is closed
        RETURNSTATUS_FAILED,            // General unknown failure
    };
    
    typedef unsigned int PatternId;
    
    struct Match
    {
        unsigned int    position;
        PatternId       id;
    };
    
public:
    
    AhoCorasickPlus();
    ~AhoCorasickPlus();
    
    EnumReturnStatus addPattern (const std::string &pattern, PatternId id);
    EnumReturnStatus addPattern (const char pattern[], PatternId id);
    void             finalize   ();
    
    void search   (std::string& text, bool keep);
    bool findNext (Match& match);
    
private:
    struct ac_trie      *m_automata;
    struct ac_text      *m_acText;
    std::queue<Match>   m_matchQueue;   // if multiple matches occur in a single
                                        // position we save them here and return
                                        // one by one for simplicity
};

#endif /* AHOCORASICKPPW_H_ */

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
 * AhoCorasickPlus.cpp: A sample C++ wrapper for Aho-Corasick C library 
 * 
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com>

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "ahocorasick.h"
#include "AhoCorasickPlus.h"

AhoCorasickPlus::AhoCorasickPlus ()
{
    m_automata = ac_trie_create ();
    m_acText = new AC_TEXT_t;
}

AhoCorasickPlus::~AhoCorasickPlus ()
{
    ac_trie_release (m_automata);
    delete m_acText;
}

AhoCorasickPlus::EnumReturnStatus AhoCorasickPlus::addPattern 
    (const std::string &pattern, PatternId id)
{
    // Adds zero-terminating string
    
    EnumReturnStatus rv = RETURNSTATUS_FAILED;
    
    AC_PATTERN_t patt;
    patt.ptext.astring = (AC_ALPHABET_t*) pattern.c_str();
    patt.ptext.length = pattern.size();
    patt.id.u.number = id;
    patt.rtext.astring = NULL;
    patt.rtext.length = 0;
    
    AC_STATUS_t status = ac_trie_add (m_automata, &patt, 0);
    
    switch (status)
    {
        case ACERR_SUCCESS: 
            rv = RETURNSTATUS_SUCCESS; 
            break;
        case ACERR_DUPLICATE_PATTERN:
            rv = RETURNSTATUS_DUPLICATE_PATTERN; 
            break;
        case ACERR_LONG_PATTERN: 
            rv = RETURNSTATUS_LONG_PATTERN; 
            break;
        case ACERR_ZERO_PATTERN: 
            rv = RETURNSTATUS_ZERO_PATTERN; 
            break;
        case ACERR_TRIE_CLOSED: 
            rv = RETURNSTATUS_AUTOMATA_CLOSED; 
            break;
    }
    return rv;
}

AhoCorasickPlus::EnumReturnStatus AhoCorasickPlus::addPattern 
    (const char pattern[], PatternId id)
{
    std::string tmpString = pattern;
    return addPattern (tmpString, id);
}

void AhoCorasickPlus::finalize ()
{
    ac_trie_finalize (m_automata);
}

void AhoCorasickPlus::search (std::string& text, bool keep)
{
    m_acText->astring = text.c_str();
    m_acText->length = text.size();
    ac_trie_settext (m_automata, m_acText, (int)keep);
}

bool AhoCorasickPlus::findNext (Match& match)
{
    if (m_matchQueue.size() > 0)
    {
        match = m_matchQueue.front();
        m_matchQueue.pop();
        return true;
    }
    
    AC_MATCH_t matchp;
    
    if ((matchp = ac_trie_findnext (m_automata)).size)
    {
        Match singleMatch;
        singleMatch.position = matchp.position;
        
        for (unsigned int j = 0; j < matchp.size; j++)
        {
            singleMatch.id = matchp.patterns[j].id.u.number;
            m_matchQueue.push(singleMatch);
        }
    }
    
    if (m_matchQueue.size() > 0)
    {
        match = m_matchQueue.front();
        m_matchQueue.pop();
        return true;
    }
    
    return false;
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/*
 * example3.cpp: It shows how to use ahocorasick library with a C++ wrapper
 * 
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com>

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <iostream>
#include <string>
#include "AhoCorasickPlus.h"

std::string sample_patterns[] = {
    "city",
    "clutter",
    "ever",
    "experience",
    "neo",
    "one",
    "simplicity",
    "utter",
    "whatever",
};
#define PATTERN_COUNT (sizeof(sample_patterns)/sizeof(std::string))

std::string chunk1 = "experience the ease and simplicity of multifast";
std::string chunk2 = "whatever you are be a good one";
std::string chunk3 = "out of clutter, find simplicity";


int main (int argc, char **argv)
{
    AhoCorasickPlus trie;

    for (unsigned int i = 0; i < PATTERN_COUNT; i++)
    {
        AhoCorasickPlus::EnumReturnStatus status;
        AhoCorasickPlus::PatternId patId = i;
        status = trie.addPattern(sample_patterns[i], patId);
        if (status != AhoCorasickPlus::RETURNSTATUS_SUCCESS)
        {
            std::cout << "Failed to add: " << sample_patterns[i] << std::endl;
        }
    }
    trie.finalize();
    
    AhoCorasickPlus::Match aMatch;
    
    std::cout << "Searching '" << chunk1 << "'" << std::endl;
    trie.search(chunk1, false);
    while (trie.findNext(aMatch))
    {
        std::cout 
                << "@" << aMatch.position 
                << "\t#" << aMatch.id 
                << "\t" << sample_patterns[aMatch.id] 
                << std::endl;
    }
    
    std::cout << "Searching '" << chunk2 << "'" << std::endl;
    trie.search(chunk2, false);
    while (trie.findNext(aMatch))
    {
        std::cout 
                << "@" << aMatch.position 
                << "\t#" << aMatch.id 
                << "\t" << sample_patterns[aMatch.id] 
                << std::endl;
    }
    
    std::cout << "Searching '" << chunk3 << "'" << std::endl;
    trie.search(chunk3, true); // try it with keep flag disabled
    while (trie.findNext(aMatch))
    {
        std::cout 
                << "@" << aMatch.position 
                << "\t#" << aMatch.id 
                << "\t" << sample_patterns[aMatch.id] 
                << std::endl;
    }
    
    return 0;
}