Trie / Aho-Corasick¶
Trie (prefix tree) and Aho-Corasick are powerful data structures for string processing, particularly useful for multiple pattern matching and prefix-based operations. The Aho-Corasick algorithm extends the trie with failure links for efficient multiple pattern matching.
Trie Data Structure¶
A trie is a tree-like data structure where each node represents a character, and paths from root to leaves represent strings.
Basic Trie Implementation¶
#include <unordered_map>
#include <string>
#include <vector>
using namespace std;
class TrieNode {
public:
unordered_map<char, TrieNode*> children;
bool isEndOfWord;
string word; // Store the complete word at end nodes
TrieNode() : isEndOfWord(false) {}
};
class Trie {
private:
TrieNode* root;
public:
Trie() {
root = new TrieNode();
}
// Insert a word into the trie
void insert(string word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
current->children[c] = new TrieNode();
}
current = current->children[c];
}
current->isEndOfWord = true;
current->word = word;
}
// Search for a word in the trie
bool search(string word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
return false;
}
current = current->children[c];
}
return current->isEndOfWord;
}
// Check if any word starts with the given prefix
bool startsWith(string prefix) {
TrieNode* current = root;
for (char c : prefix) {
if (current->children.find(c) == current->children.end()) {
return false;
}
current = current->children[c];
}
return true;
}
// Get all words with the given prefix
vector<string> getWordsWithPrefix(string prefix) {
vector<string> result;
TrieNode* current = root;
// Navigate to the prefix node
for (char c : prefix) {
if (current->children.find(c) == current->children.end()) {
return result; // No words with this prefix
}
current = current->children[c];
}
// DFS to collect all words from this node
collectWords(current, result);
return result;
}
private:
void collectWords(TrieNode* node, vector<string>& result) {
if (node->isEndOfWord) {
result.push_back(node->word);
}
for (auto& pair : node->children) {
collectWords(pair.second, result);
}
}
};
Aho-Corasick Algorithm¶
The Aho-Corasick algorithm extends the trie with failure links to efficiently find all occurrences of multiple patterns in a text.
Aho-Corasick Implementation¶
#include <unordered_map>
#include <string>
#include <vector>
#include <queue>
using namespace std;
class AhoCorasickNode {
public:
unordered_map<char, AhoCorasickNode*> children;
AhoCorasickNode* failure;
vector<string> output; // Patterns that end at this node
bool isEndOfWord;
AhoCorasickNode() : failure(nullptr), isEndOfWord(false) {}
};
class AhoCorasick {
private:
AhoCorasickNode* root;
public:
AhoCorasick() {
root = new AhoCorasickNode();
}
// Build the trie with patterns
void buildTrie(vector<string> patterns) {
for (string pattern : patterns) {
insertPattern(pattern);
}
buildFailureLinks();
}
// Insert a single pattern
void insertPattern(string pattern) {
AhoCorasickNode* current = root;
for (char c : pattern) {
if (current->children.find(c) == current->children.end()) {
current->children[c] = new AhoCorasickNode();
}
current = current->children[c];
}
current->isEndOfWord = true;
current->output.push_back(pattern);
}
// Build failure links using BFS
void buildFailureLinks() {
queue<AhoCorasickNode*> q;
// Initialize failure links for root's children
for (auto& pair : root->children) {
pair.second->failure = root;
q.push(pair.second);
}
while (!q.empty()) {
AhoCorasickNode* current = q.front();
q.pop();
for (auto& pair : current->children) {
char c = pair.first;
AhoCorasickNode* child = pair.second;
AhoCorasickNode* failure = current->failure;
// Find the longest proper suffix that is also a prefix
while (failure != root && failure->children.find(c) == failure->children.end()) {
failure = failure->failure;
}
if (failure->children.find(c) != failure->children.end()) {
child->failure = failure->children[c];
} else {
child->failure = root;
}
// Merge output functions
child->output.insert(child->output.end(),
child->failure->output.begin(),
child->failure->output.end());
q.push(child);
}
}
}
// Search for all patterns in text
vector<pair<string, int>> search(string text) {
vector<pair<string, int>> matches;
AhoCorasickNode* current = root;
for (int i = 0; i < text.length(); i++) {
char c = text[i];
// Follow failure links until we find a child with character c
while (current != root && current->children.find(c) == current->children.end()) {
current = current->failure;
}
if (current->children.find(c) != current->children.end()) {
current = current->children[c];
} else {
current = root;
}
// Add all patterns that end at current node
for (string pattern : current->output) {
matches.push_back({pattern, i - pattern.length() + 1});
}
}
return matches;
}
};
Example: Multiple Pattern Matching¶
Problem: Implement a trie for word storage and an Aho-Corasick automaton for multiple pattern matching.
Sample Input: - Trie operations: insert "hello", "world", "hell", "he" - Aho-Corasick patterns: ["he", "she", "his", "hers"] - Text: "ushers"
Sample Output:
Search 'hello': 1
Search 'hel': 0
Starts with 'he': 1
Words with prefix 'he': he hell hello
Patterns found in 'ushers':
Pattern 'he' at position 1
Pattern 'she' at position 2
Pattern 'hers' at position 3
#include <iostream>
#include <vector>
using namespace std;
int main() {
// Example usage of Trie
Trie trie;
trie.insert("hello");
trie.insert("world");
trie.insert("hell");
trie.insert("he");
cout << "Search 'hello': " << trie.search("hello") << endl;
cout << "Search 'hel': " << trie.search("hel") << endl;
cout << "Starts with 'he': " << trie.startsWith("he") << endl;
vector<string> words = trie.getWordsWithPrefix("he");
cout << "Words with prefix 'he': ";
for (string word : words) {
cout << word << " ";
}
cout << endl;
// Example usage of Aho-Corasick
AhoCorasick ac;
vector<string> patterns = {"he", "she", "his", "hers"};
ac.buildTrie(patterns);
string text = "ushers";
vector<pair<string, int>> matches = ac.search(text);
cout << "Patterns found in '" << text << "':" << endl;
for (auto& match : matches) {
cout << "Pattern '" << match.first << "' at position " << match.second << endl;
}
return 0;
}
Advanced Applications¶
Autocomplete System¶
class AutocompleteSystem {
private:
Trie trie;
public:
void addWord(string word) {
trie.insert(word);
}
vector<string> getSuggestions(string prefix) {
return trie.getWordsWithPrefix(prefix);
}
};
Word Search in 2D Grid¶
#include <vector>
#include <string>
using namespace std;
class WordSearch {
private:
vector<vector<char>> board;
int rows, cols;
vector<string> words;
vector<string> result;
public:
vector<string> findWords(vector<vector<char>>& board, vector<string>& words) {
this->board = board;
this->words = words;
rows = board.size();
cols = board[0].size();
// Build trie from words
Trie trie;
for (string word : words) {
trie.insert(word);
}
// Search from each cell
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
dfs(i, j, "", trie.root);
}
}
return result;
}
private:
void dfs(int row, int col, string current, TrieNode* node) {
if (row < 0 || row >= rows || col < 0 || col >= cols || board[row][col] == '#') {
return;
}
char c = board[row][col];
if (node->children.find(c) == node->children.end()) {
return;
}
current += c;
node = node->children[c];
if (node->isEndOfWord) {
result.push_back(current);
node->isEndOfWord = false; // Avoid duplicates
}
board[row][col] = '#'; // Mark as visited
// Explore all four directions
dfs(row + 1, col, current, node);
dfs(row - 1, col, current, node);
dfs(row, col + 1, current, node);
dfs(row, col - 1, current, node);
board[row][col] = c; // Restore
}
};
Use Cases¶
Trie Applications¶
- Autocomplete Systems: Finding words with given prefixes
- Spell Checkers: Validating word existence
- IP Routing: Longest prefix matching
- Text Compression: LZW algorithm
- Dictionary Operations: Fast word lookup
Aho-Corasick Applications¶
- Multiple Pattern Matching: Finding all patterns in text
- Virus Detection: Scanning for multiple virus signatures
- Text Mining: Extracting multiple keywords
- DNA Sequence Analysis: Finding multiple gene patterns
- Network Intrusion Detection: Pattern matching in network traffic
Complexity Analysis¶
Trie Operations¶
- Insert: O(m) where m is the length of the word
- Search: O(m) where m is the length of the word
- Prefix Search: O(m) where m is the length of the prefix
- Space: O(ALPHABET_SIZE * N * M) where N is number of words and M is average length
Aho-Corasick¶
- Preprocessing: O(sum of pattern lengths)
- Search: O(text length + number of matches)
- Space: O(sum of pattern lengths)
Optimization Tips¶
- Memory Optimization: Use array instead of hash map for small alphabets
- Compression: Implement compressed tries (radix trees) for space efficiency
- Lazy Loading: Build trie nodes only when needed
- Caching: Cache frequently accessed nodes
- Early Termination: Stop searching when no more matches are possible