Aho-Corasick Algorithm
Aho-Corasick Algorithm for Multiple Pattern Matching
Aho-Corasick Algorithm
Aho-Corasick is an algorithm for searching multiple patterns in text simultaneously in time complexity where is the total number of matches.
Main Concept
Combines Trie with KMP failure links into an automaton that can search multiple patterns at once.
Structure
Each node stores:
children[c]: child for character cfail: failure link (similar to KMP)output: list of patterns that end at this node
Implementation
struct AhoCorasick {
struct Node {
int children[26];
int fail;
vector<int> output; // pattern indices that end here
Node() {
fill(children, children + 26, -1);
fail = 0;
}
};
vector<Node> trie;
AhoCorasick() {
trie.push_back(Node()); // root
}
// Add pattern with given index
void addPattern(const string& pattern, int idx) {
int cur = 0;
for (char c : pattern) {
int ch = c - 'a';
if (trie[cur].children[ch] == -1) {
trie[cur].children[ch] = trie.size();
trie.push_back(Node());
}
cur = trie[cur].children[ch];
}
trie[cur].output.push_back(idx);
}
// Build failure links using BFS
void build() {
queue<int> q;
// Initialize: children of root have fail = 0
for (int c = 0; c < 26; c++) {
if (trie[0].children[c] != -1) {
trie[trie[0].children[c]].fail = 0;
q.push(trie[0].children[c]);
}
}
while (!q.empty()) {
int u = q.front();
q.pop();
for (int c = 0; c < 26; c++) {
int v = trie[u].children[c];
if (v == -1) continue;
// Find failure link
int f = trie[u].fail;
while (f != 0 && trie[f].children[c] == -1) {
f = trie[f].fail;
}
if (trie[f].children[c] != -1 && trie[f].children[c] != v) {
trie[v].fail = trie[f].children[c];
} else {
trie[v].fail = 0;
}
// Merge output lists
for (int idx : trie[trie[v].fail].output) {
trie[v].output.push_back(idx);
}
q.push(v);
}
}
}
// Search text and return all matches
vector<pair<int, int>> search(const string& text) {
vector<pair<int, int>> matches; // (position, pattern_index)
int cur = 0;
for (int i = 0; i < text.size(); i++) {
int c = text[i] - 'a';
// Follow failure links until match or root
while (cur != 0 && trie[cur].children[c] == -1) {
cur = trie[cur].fail;
}
if (trie[cur].children[c] != -1) {
cur = trie[cur].children[c];
}
// Report all patterns ending at this position
for (int idx : trie[cur].output) {
matches.push_back({i, idx});
}
}
return matches;
}
};
Optimized Version (with go array)
Add go[v][c] to avoid following multiple failure links:
struct AhoCorasickOptimized {
vector<array<int, 26>> go;
vector<int> fail;
vector<vector<int>> output;
int size;
AhoCorasickOptimized() {
go.push_back({});
fill(go[0].begin(), go[0].end(), 0);
fail.push_back(0);
output.push_back({});
size = 1;
}
void addPattern(const string& s, int idx) {
int cur = 0;
for (char ch : s) {
int c = ch - 'a';
if (go[cur][c] == 0) {
go[cur][c] = size++;
go.push_back({});
fill(go.back().begin(), go.back().end(), 0);
fail.push_back(0);
output.push_back({});
}
cur = go[cur][c];
}
output[cur].push_back(idx);
}
void build() {
queue<int> q;
for (int c = 0; c < 26; c++) {
if (go[0][c] != 0) {
q.push(go[0][c]);
}
}
while (!q.empty()) {
int u = q.front();
q.pop();
for (int c = 0; c < 26; c++) {
if (go[u][c] != 0) {
int v = go[u][c];
int f = fail[u];
while (f && go[f][c] == 0) f = fail[f];
fail[v] = go[f][c];
if (fail[v] == v) fail[v] = 0;
// Merge outputs
for (int x : output[fail[v]]) {
output[v].push_back(x);
}
q.push(v);
} else {
// Compute go function
int f = fail[u];
while (f && go[f][c] == 0) f = fail[f];
go[u][c] = go[f][c];
}
}
}
}
vector<pair<int, int>> search(const string& text) {
vector<pair<int, int>> matches;
int cur = 0;
for (int i = 0; i < text.size(); i++) {
cur = go[cur][text[i] - 'a'];
for (int idx : output[cur]) {
matches.push_back({i, idx});
}
}
return matches;
}
};
Applications
1. Multiple Pattern Search
void findAllPatterns(const string& text, vector<string>& patterns) {
AhoCorasick ac;
for (int i = 0; i < patterns.size(); i++) {
ac.addPattern(patterns[i], i);
}
ac.build();
auto matches = ac.search(text);
for (auto& [pos, idx] : matches) {
cout << "Pattern '" << patterns[idx] << "' found at position "
<< pos - patterns[idx].size() + 1 << endl;
}
}
2. Count Occurrences of Each Pattern
vector<int> countOccurrences(const string& text, vector<string>& patterns) {
AhoCorasick ac;
for (int i = 0; i < patterns.size(); i++) {
ac.addPattern(patterns[i], i);
}
ac.build();
vector<int> count(patterns.size(), 0);
auto matches = ac.search(text);
for (auto& [pos, idx] : matches) {
count[idx]++;
}
return count;
}
3. Find Minimum Patterns to Cover Text
int minPatternsToCover(const string& text, vector<string>& patterns) {
// Build AC automaton
// DP: dp[i] = min patterns to cover text[0..i-1]
// ...
}
Complexity
| Operation | Time | Space |
|---|---|---|
| Add pattern | ||
| Build | ||
| Search |
Where:
- = total length of all patterns
- = alphabet size
- = number of matches
Tips
- Failure links are similar to KMP - used to find longest proper suffix that is a prefix of any pattern
- Output merging - must merge outputs from failure chain or you’ll miss matches
- Go array - precompute transitions to speed up search (no need to follow failure links)
- Memory - for large alphabet, use map instead of array