Suffix Automaton
Suffix Automaton (SAM) data structure for Advanced String Problems
Suffix Automaton (SAM)
Suffix Automaton is the smallest finite automaton that accepts all suffixes of string .
Properties
For string of length :
- Has at most states
- Has at most transitions
- Can be built in time and space
Structure
Each state stores:
link(suffix link): points to state representing longest proper suffixlen: length of longest string this state representsnext[c]: transition to next state when reading character c
Implementation
struct SuffixAutomaton {
struct State {
int len, link;
map<char, int> next;
// or use int next[26] for lowercase letters
};
vector<State> st;
int last;
SuffixAutomaton() {
st.push_back({0, -1, {}}); // Initial state
last = 0;
}
void extend(char c) {
int cur = st.size();
st.push_back({st[last].len + 1, -1, {}});
int p = last;
while (p != -1 && !st[p].next.count(c)) {
st[p].next[c] = cur;
p = st[p].link;
}
if (p == -1) {
st[cur].link = 0;
} else {
int q = st[p].next[c];
if (st[p].len + 1 == st[q].len) {
st[cur].link = q;
} else {
int clone = st.size();
st.push_back({st[p].len + 1, st[q].link, st[q].next});
while (p != -1 && st[p].next[c] == q) {
st[p].next[c] = clone;
p = st[p].link;
}
st[q].link = st[cur].link = clone;
}
}
last = cur;
}
void build(const string& s) {
for (char c : s) {
extend(c);
}
}
};
Concept
Endpos Equivalence Class
- endpos(t) = set of positions where substring ends
- States in SAM represent equivalence classes of strings with the same endpos
Suffix Links
Suffix link of state points to state representing longest proper suffix of strings in that belongs to another equivalence class.
Example: s = "abab"
State 0 (initial): ""
State 1: "a" (endpos = {0, 2})
State 2: "ab" (endpos = {1, 3})
State 3: "aba" → "abab" (endpos = {3})
State 4: "b" (endpos = {1, 3})
State 5: "bab" (endpos = {3})
Suffix links:
3 → 2 → 1 → 0
5 → 4 → 0
Applications
1. Check if Pattern is a Substring
bool contains(const SuffixAutomaton& sam, const string& pattern) {
int cur = 0;
for (char c : pattern) {
if (!sam.st[cur].next.count(c)) {
return false;
}
cur = sam.st[cur].next.at(c);
}
return true;
}
2. Count Distinct Substrings
long long countDistinct(const SuffixAutomaton& sam) {
long long count = 0;
for (int i = 1; i < sam.st.size(); i++) {
count += sam.st[i].len - sam.st[sam.st[i].link].len;
}
return count;
}
3. Count Number of Occurrences
struct SAMWithCount : SuffixAutomaton {
vector<int> cnt; // number of occurrences
void computeCounts() {
int n = st.size();
cnt.assign(n, 0);
// Mark states that correspond to prefixes
int cur = 0;
// ... (mark terminal states)
// Topological sort by len
vector<int> order(n);
iota(order.begin(), order.end(), 0);
sort(order.begin(), order.end(), [&](int a, int b) {
return st[a].len > st[b].len;
});
// Propagate counts via suffix links
for (int v : order) {
if (st[v].link >= 0) {
cnt[st[v].link] += cnt[v];
}
}
}
int countOccurrences(const string& pattern) {
int cur = 0;
for (char c : pattern) {
if (!st[cur].next.count(c)) return 0;
cur = st[cur].next.at(c);
}
return cnt[cur];
}
};
4. Find Longest Common Substring
string longestCommonSubstring(const string& a, const string& b) {
SuffixAutomaton sam;
sam.build(a);
int cur = 0, len = 0;
int bestLen = 0, bestPos = 0;
for (int i = 0; i < b.size(); i++) {
char c = b[i];
while (cur != 0 && !sam.st[cur].next.count(c)) {
cur = sam.st[cur].link;
len = sam.st[cur].len;
}
if (sam.st[cur].next.count(c)) {
cur = sam.st[cur].next[c];
len++;
}
if (len > bestLen) {
bestLen = len;
bestPos = i - len + 1;
}
}
return b.substr(bestPos, bestLen);
}
5. Lexicographically K-th Substring
string kthSubstring(SuffixAutomaton& sam, long long k) {
int n = sam.st.size();
vector<long long> paths(n);
// Count paths (substrings) from each state
// Topological order by len (decreasing)
vector<int> order(n);
iota(order.begin(), order.end(), 0);
sort(order.begin(), order.end(), [&](int a, int b) {
return sam.st[a].len > sam.st[b].len;
});
for (int v : order) {
paths[v] = 1; // empty string from this state
for (auto& [c, u] : sam.st[v].next) {
paths[v] += paths[u];
}
}
// Find k-th substring
string result;
int cur = 0;
while (k > 0) {
for (auto& [c, u] : sam.st[cur].next) {
if (paths[u] >= k) {
result += c;
k--; // count this transition
cur = u;
break;
}
k -= paths[u];
}
}
return result;
}
Complexity
| Operation | Time | Space |
|---|---|---|
| Build | or | |
| Check substring | ||
| Count distinct |
Tips
- SAM vs Suffix Array: SAM is better for online queries, SA is better for LCP queries
- Memory: use
mapfor large alphabet, array for small alphabet - Clone states: understand clone operation to debug effectively
- Suffix links form a tree: can use DFS/BFS on suffix link tree