Example 3
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | /* * AhoCorasickPlus.h: This is the header file for a sample * C++ wrapper for Aho-Corasick C library * * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com> multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see <http://www.gnu.org/licenses/>. */ #ifndef AHOCORASICKPPW_H_ #define AHOCORASICKPPW_H_ #include <string> #include <queue> // Forward declarations struct ac_trie; struct ac_text; class AhoCorasickPlus { public: enum EnumReturnStatus { RETURNSTATUS_SUCCESS = 0, // No error occurred RETURNSTATUS_DUPLICATE_PATTERN, // Duplicate patterns RETURNSTATUS_LONG_PATTERN, // Long pattern RETURNSTATUS_ZERO_PATTERN, // Empty pattern (zero length) RETURNSTATUS_AUTOMATA_CLOSED, // Automata is closed RETURNSTATUS_FAILED, // General unknown failure }; typedef unsigned int PatternId; struct Match { unsigned int position; PatternId id; }; public: AhoCorasickPlus(); ~AhoCorasickPlus(); EnumReturnStatus addPattern (const std::string &pattern, PatternId id); EnumReturnStatus addPattern (const char pattern[], PatternId id); void finalize (); void search (std::string& text, bool keep); bool findNext (Match& match); private: struct ac_trie *m_automata; struct ac_text *m_acText; std::queue<Match> m_matchQueue; // if multiple matches occur in a single // position we save them here and return // one by one for simplicity }; #endif /* AHOCORASICKPPW_H_ */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* * AhoCorasickPlus.cpp: A sample C++ wrapper for Aho-Corasick C library * * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com> multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see <http://www.gnu.org/licenses/>. */ #include "ahocorasick.h" #include "AhoCorasickPlus.h" AhoCorasickPlus::AhoCorasickPlus () { m_automata = ac_trie_create (); m_acText = new AC_TEXT_t; } AhoCorasickPlus::~AhoCorasickPlus () { ac_trie_release (m_automata); delete m_acText; } AhoCorasickPlus::EnumReturnStatus AhoCorasickPlus::addPattern (const std::string &pattern, PatternId id) { // Adds zero-terminating string EnumReturnStatus rv = RETURNSTATUS_FAILED; AC_PATTERN_t patt; patt.ptext.astring = (AC_ALPHABET_t*) pattern.c_str(); patt.ptext.length = pattern.size(); patt.id.u.number = id; patt.rtext.astring = NULL; patt.rtext.length = 0; AC_STATUS_t status = ac_trie_add (m_automata, &patt, 0); switch (status) { case ACERR_SUCCESS: rv = RETURNSTATUS_SUCCESS; break; case ACERR_DUPLICATE_PATTERN: rv = RETURNSTATUS_DUPLICATE_PATTERN; break; case ACERR_LONG_PATTERN: rv = RETURNSTATUS_LONG_PATTERN; break; case ACERR_ZERO_PATTERN: rv = RETURNSTATUS_ZERO_PATTERN; break; case ACERR_TRIE_CLOSED: rv = RETURNSTATUS_AUTOMATA_CLOSED; break; } return rv; } AhoCorasickPlus::EnumReturnStatus AhoCorasickPlus::addPattern (const char pattern[], PatternId id) { std::string tmpString = pattern; return addPattern (tmpString, id); } void AhoCorasickPlus::finalize () { ac_trie_finalize (m_automata); } void AhoCorasickPlus::search (std::string& text, bool keep) { m_acText->astring = text.c_str(); m_acText->length = text.size(); ac_trie_settext (m_automata, m_acText, (int)keep); } bool AhoCorasickPlus::findNext (Match& match) { if (m_matchQueue.size() > 0) { match = m_matchQueue.front(); m_matchQueue.pop(); return true; } AC_MATCH_t matchp; if ((matchp = ac_trie_findnext (m_automata)).size) { Match singleMatch; singleMatch.position = matchp.position; for (unsigned int j = 0; j < matchp.size; j++) { singleMatch.id = matchp.patterns[j].id.u.number; m_matchQueue.push(singleMatch); } } if (m_matchQueue.size() > 0) { match = m_matchQueue.front(); m_matchQueue.pop(); return true; } return false; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | /* * example3.cpp: It shows how to use ahocorasick library with a C++ wrapper * * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani <kamiar.kanani@gmail.com> multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see <http://www.gnu.org/licenses/>. */ #include <iostream> #include <string> #include "AhoCorasickPlus.h" std::string sample_patterns[] = { "city", "clutter", "ever", "experience", "neo", "one", "simplicity", "utter", "whatever", }; #define PATTERN_COUNT (sizeof(sample_patterns)/sizeof(std::string)) std::string chunk1 = "experience the ease and simplicity of multifast"; std::string chunk2 = "whatever you are be a good one"; std::string chunk3 = "out of clutter, find simplicity"; int main (int argc, char **argv) { AhoCorasickPlus trie; for (unsigned int i = 0; i < PATTERN_COUNT; i++) { AhoCorasickPlus::EnumReturnStatus status; AhoCorasickPlus::PatternId patId = i; status = trie.addPattern(sample_patterns[i], patId); if (status != AhoCorasickPlus::RETURNSTATUS_SUCCESS) { std::cout << "Failed to add: " << sample_patterns[i] << std::endl; } } trie.finalize(); AhoCorasickPlus::Match aMatch; std::cout << "Searching '" << chunk1 << "'" << std::endl; trie.search(chunk1, false); while (trie.findNext(aMatch)) { std::cout << "@" << aMatch.position << "\t#" << aMatch.id << "\t" << sample_patterns[aMatch.id] << std::endl; } std::cout << "Searching '" << chunk2 << "'" << std::endl; trie.search(chunk2, false); while (trie.findNext(aMatch)) { std::cout << "@" << aMatch.position << "\t#" << aMatch.id << "\t" << sample_patterns[aMatch.id] << std::endl; } std::cout << "Searching '" << chunk3 << "'" << std::endl; trie.search(chunk3, true); // try it with keep flag disabled while (trie.findNext(aMatch)) { std::cout << "@" << aMatch.position << "\t#" << aMatch.id << "\t" << sample_patterns[aMatch.id] << std::endl; } return 0; } |