Suffix Automaton

Suffix Automaton (SAM) data structure for Advanced String Problems

Suffix Automaton (SAM)

Suffix Automaton is the smallest finite automaton that accepts all suffixes of string $s$ .

Properties

For string of length $n$ :

Has at most $2n - 1$ states
Has at most $3n - 4$ transitions
Can be built in $O(n)$ time and space

Structure

Each state stores:

link (suffix link): points to state representing longest proper suffix
len: length of longest string this state represents
next[c]: transition to next state when reading character c

Implementation

struct SuffixAutomaton {
    struct State {
        int len, link;
        map<char, int> next;
        // or use int next[26] for lowercase letters
    };
    
    vector<State> st;
    int last;
    
    SuffixAutomaton() {
        st.push_back({0, -1, {}});  // Initial state
        last = 0;
    }
    
    void extend(char c) {
        int cur = st.size();
        st.push_back({st[last].len + 1, -1, {}});
        
        int p = last;
        while (p != -1 && !st[p].next.count(c)) {
            st[p].next[c] = cur;
            p = st[p].link;
        }
        
        if (p == -1) {
            st[cur].link = 0;
        } else {
            int q = st[p].next[c];
            if (st[p].len + 1 == st[q].len) {
                st[cur].link = q;
            } else {
                int clone = st.size();
                st.push_back({st[p].len + 1, st[q].link, st[q].next});
                
                while (p != -1 && st[p].next[c] == q) {
                    st[p].next[c] = clone;
                    p = st[p].link;
                }
                
                st[q].link = st[cur].link = clone;
            }
        }
        
        last = cur;
    }
    
    void build(const string& s) {
        for (char c : s) {
            extend(c);
        }
    }
};

Concept

Endpos Equivalence Class

endpos(t) = set of positions where substring $t$ ends
States in SAM represent equivalence classes of strings with the same endpos

Suffix Links

Suffix link of state $v$ points to state representing longest proper suffix of strings in $v$ that belongs to another equivalence class.

Example: s = "abab"

State 0 (initial): ""
State 1: "a" (endpos = {0, 2})
State 2: "ab" (endpos = {1, 3})
State 3: "aba" → "abab" (endpos = {3})
State 4: "b" (endpos = {1, 3})
State 5: "bab" (endpos = {3})

Suffix links:
3 → 2 → 1 → 0
5 → 4 → 0

Applications

1. Check if Pattern is a Substring

bool contains(const SuffixAutomaton& sam, const string& pattern) {
    int cur = 0;
    for (char c : pattern) {
        if (!sam.st[cur].next.count(c)) {
            return false;
        }
        cur = sam.st[cur].next.at(c);
    }
    return true;
}

2. Count Distinct Substrings

long long countDistinct(const SuffixAutomaton& sam) {
    long long count = 0;
    for (int i = 1; i < sam.st.size(); i++) {
        count += sam.st[i].len - sam.st[sam.st[i].link].len;
    }
    return count;
}

3. Count Number of Occurrences

struct SAMWithCount : SuffixAutomaton {
    vector<int> cnt;  // number of occurrences
    
    void computeCounts() {
        int n = st.size();
        cnt.assign(n, 0);
        
        // Mark states that correspond to prefixes
        int cur = 0;
        // ... (mark terminal states)
        
        // Topological sort by len
        vector<int> order(n);
        iota(order.begin(), order.end(), 0);
        sort(order.begin(), order.end(), [&](int a, int b) {
            return st[a].len > st[b].len;
        });
        
        // Propagate counts via suffix links
        for (int v : order) {
            if (st[v].link >= 0) {
                cnt[st[v].link] += cnt[v];
            }
        }
    }
    
    int countOccurrences(const string& pattern) {
        int cur = 0;
        for (char c : pattern) {
            if (!st[cur].next.count(c)) return 0;
            cur = st[cur].next.at(c);
        }
        return cnt[cur];
    }
};

4. Find Longest Common Substring

string longestCommonSubstring(const string& a, const string& b) {
    SuffixAutomaton sam;
    sam.build(a);
    
    int cur = 0, len = 0;
    int bestLen = 0, bestPos = 0;
    
    for (int i = 0; i < b.size(); i++) {
        char c = b[i];
        
        while (cur != 0 && !sam.st[cur].next.count(c)) {
            cur = sam.st[cur].link;
            len = sam.st[cur].len;
        }
        
        if (sam.st[cur].next.count(c)) {
            cur = sam.st[cur].next[c];
            len++;
        }
        
        if (len > bestLen) {
            bestLen = len;
            bestPos = i - len + 1;
        }
    }
    
    return b.substr(bestPos, bestLen);
}

5. Lexicographically K-th Substring

string kthSubstring(SuffixAutomaton& sam, long long k) {
    int n = sam.st.size();
    vector<long long> paths(n);
    
    // Count paths (substrings) from each state
    // Topological order by len (decreasing)
    vector<int> order(n);
    iota(order.begin(), order.end(), 0);
    sort(order.begin(), order.end(), [&](int a, int b) {
        return sam.st[a].len > sam.st[b].len;
    });
    
    for (int v : order) {
        paths[v] = 1;  // empty string from this state
        for (auto& [c, u] : sam.st[v].next) {
            paths[v] += paths[u];
        }
    }
    
    // Find k-th substring
    string result;
    int cur = 0;
    
    while (k > 0) {
        for (auto& [c, u] : sam.st[cur].next) {
            if (paths[u] >= k) {
                result += c;
                k--;  // count this transition
                cur = u;
                break;
            }
            k -= paths[u];
        }
    }
    
    return result;
}

Complexity

Operation	Time	Space
Build	$O(n)$	$O(n)$ or $O(n \cdot \Sigma)$
Check substring	$O(m)$	$O(1)$
Count distinct	$O(n)$	$O(1)$

Tips

SAM vs Suffix Array: SAM is better for online queries, SA is better for LCP queries
Memory: use map for large alphabet, array for small alphabet
Clone states: understand clone operation to debug effectively
Suffix links form a tree: can use DFS/BFS on suffix link tree