ICPC Competitive Programming

Suffix Array

Suffix Array for Advanced String Problems

Suffix Array

Suffix Array is a data structure that stores all suffixes of a string in lexicographically sorted order.

Definition

For string ss of length nn:

  • Suffix ii = substring s[i..n1]s[i..n-1]
  • Suffix Array SASA = array of indices such that s[SA[0]]<s[SA[1]]<<s[SA[n1]]s[SA[0]] < s[SA[1]] < \ldots < s[SA[n-1]] (lexicographically)

Example

String: "banana"

iSuffix
0banana
1anana
2nana
3ana
4na
5a

Sorted suffixes:

  1. a (index 5)
  2. ana (index 3)
  3. anana (index 1)
  4. banana (index 0)
  5. na (index 4)
  6. nana (index 2)

Suffix Array = [5, 3, 1, 0, 4, 2]

Construction Algorithm

O(n log^2 n) - Simple Version

vector<int> buildSuffixArray(const string& s) {
    int n = s.size();
    vector<int> sa(n), rank(n), tmp(n);
    
    // Initial ranking by first character
    for (int i = 0; i < n; i++) {
        sa[i] = i;
        rank[i] = s[i];
    }
    
    // Double the length each iteration
    for (int k = 1; k < n; k *= 2) {
        // Sort by (rank[i], rank[i+k])
        auto cmp = [&](int a, int b) {
            if (rank[a] != rank[b]) return rank[a] < rank[b];
            int ra = a + k < n ? rank[a + k] : -1;
            int rb = b + k < n ? rank[b + k] : -1;
            return ra < rb;
        };
        
        sort(sa.begin(), sa.end(), cmp);
        
        // Update ranks
        tmp[sa[0]] = 0;
        for (int i = 1; i < n; i++) {
            tmp[sa[i]] = tmp[sa[i-1]] + (cmp(sa[i-1], sa[i]) ? 1 : 0);
        }
        rank = tmp;
    }
    
    return sa;
}

O(n log n) - Optimized with Counting Sort

vector<int> suffixArray(const string& s) {
    int n = s.size();
    const int ALPHA = 256;
    
    vector<int> sa(n), rank(n), tmp(n), cnt(max(n, ALPHA));
    
    // Initial sort by first character
    for (int i = 0; i < n; i++) cnt[s[i]]++;
    for (int i = 1; i < ALPHA; i++) cnt[i] += cnt[i-1];
    for (int i = n-1; i >= 0; i--) sa[--cnt[s[i]]] = i;
    
    rank[sa[0]] = 0;
    for (int i = 1; i < n; i++) {
        rank[sa[i]] = rank[sa[i-1]] + (s[sa[i]] != s[sa[i-1]]);
    }
    
    for (int k = 1; k < n; k *= 2) {
        // Sort by second key (rank[i+k])
        fill(cnt.begin(), cnt.end(), 0);
        for (int i = 0; i < n; i++) {
            int key = i + k < n ? rank[i + k] + 1 : 0;
            cnt[key]++;
        }
        for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
        for (int i = n-1; i >= 0; i--) {
            int key = sa[i] + k < n ? rank[sa[i] + k] + 1 : 0;
            tmp[--cnt[key]] = sa[i];
        }
        
        // Sort by first key (rank[i])
        fill(cnt.begin(), cnt.end(), 0);
        for (int i = 0; i < n; i++) cnt[rank[i]]++;
        for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
        for (int i = n-1; i >= 0; i--) {
            sa[--cnt[rank[tmp[i]]]] = tmp[i];
        }
        
        // Update ranks
        tmp[sa[0]] = 0;
        for (int i = 1; i < n; i++) {
            int prev = sa[i-1], cur = sa[i];
            bool same = rank[prev] == rank[cur];
            if (same) {
                int p2 = prev + k < n ? rank[prev + k] : -1;
                int c2 = cur + k < n ? rank[cur + k] : -1;
                same = p2 == c2;
            }
            tmp[sa[i]] = tmp[sa[i-1]] + (same ? 0 : 1);
        }
        rank = tmp;
        
        if (rank[sa[n-1]] == n-1) break;
    }
    
    return sa;
}

LCP Array

LCP (Longest Common Prefix) Array: lcp[i]lcp[i] = length of common prefix between suffix SA[i]SA[i] and SA[i1]SA[i-1]

Kasai’s Algorithm - O(n)

vector<int> buildLCP(const string& s, const vector<int>& sa) {
    int n = s.size();
    vector<int> rank(n), lcp(n);
    
    for (int i = 0; i < n; i++) rank[sa[i]] = i;
    
    int k = 0;
    for (int i = 0; i < n; i++) {
        if (rank[i] == 0) {
            k = 0;
            continue;
        }
        
        int j = sa[rank[i] - 1];
        while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
            k++;
        }
        
        lcp[rank[i]] = k;
        if (k > 0) k--;
    }
    
    return lcp;
}

Applications

pair<int,int> findPattern(const string& text, const vector<int>& sa, const string& pattern) {
    int n = text.size(), m = pattern.size();
    
    // Find lower bound
    int lo = 0, hi = n;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (text.compare(sa[mid], m, pattern) < 0) lo = mid + 1;
        else hi = mid;
    }
    int left = lo;
    
    // Find upper bound
    hi = n;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (text.compare(sa[mid], m, pattern) <= 0) lo = mid + 1;
        else hi = mid;
    }
    int right = lo;
    
    return {left, right};  // matches at sa[left..right-1]
}

2. Count Distinct Substrings

long long countDistinctSubstrings(const string& s) {
    int n = s.size();
    vector<int> sa = suffixArray(s);
    vector<int> lcp = buildLCP(s, sa);
    
    // Total substrings = n*(n+1)/2
    // Subtract duplicates (sum of LCP)
    long long total = (long long)n * (n + 1) / 2;
    for (int i = 1; i < n; i++) {
        total -= lcp[i];
    }
    
    return total;
}

3. Longest Repeated Substring

string longestRepeatedSubstring(const string& s) {
    vector<int> sa = suffixArray(s);
    vector<int> lcp = buildLCP(s, sa);
    
    int maxLcp = 0, idx = 0;
    for (int i = 1; i < lcp.size(); i++) {
        if (lcp[i] > maxLcp) {
            maxLcp = lcp[i];
            idx = sa[i];
        }
    }
    
    return s.substr(idx, maxLcp);
}

4. Longest Common Substring of 2 Strings

string longestCommonSubstring(const string& a, const string& b) {
    string combined = a + "$" + b;
    int n = combined.size();
    int lenA = a.size();
    
    vector<int> sa = suffixArray(combined);
    vector<int> lcp = buildLCP(combined, sa);
    
    int maxLcp = 0, idx = 0;
    for (int i = 1; i < n; i++) {
        // Check if consecutive suffixes are from different strings
        bool first = sa[i-1] < lenA;
        bool second = sa[i] < lenA;
        
        if (first != second && lcp[i] > maxLcp) {
            maxLcp = lcp[i];
            idx = sa[i];
        }
    }
    
    return combined.substr(idx, maxLcp);
}

Complexity

OperationTimeSpace
Build SAO(nlogn)O(n \log n)O(n)O(n)
Build LCPO(n)O(n)O(n)O(n)
Pattern searchO(mlogn)O(m \log n)O(1)O(1)

Tips

  1. Use LCP array together with SA to solve many problems
  2. Counting sort speeds up SA building from O(nlog2n)O(n \log^2 n) to O(nlogn)O(n \log n)
  3. SA-IS Algorithm can build SA in O(n)O(n) but is harder to implement