ICPC Competitive Programming

Suffix Array

Suffix Array สำหรับปัญหา String ขั้นสูง

Suffix Array

Suffix Array เป็นโครงสร้างข้อมูลที่เก็บ suffixes ทั้งหมดของ string ในลำดับ lexicographically sorted

นิยาม

สำหรับ string ss ความยาว nn:

  • Suffix ii = substring s[i..n1]s[i..n-1]
  • Suffix Array SASA = array ของ indices ที่ทำให้ s[SA[0]]<s[SA[1]]<<s[SA[n1]]s[SA[0]] < s[SA[1]] < \ldots < s[SA[n-1]] (lexicographically)

ตัวอย่าง

String: "banana"

iSuffix
0banana
1anana
2nana
3ana
4na
5a

Sorted suffixes:

  1. a (index 5)
  2. ana (index 3)
  3. anana (index 1)
  4. banana (index 0)
  5. na (index 4)
  6. nana (index 2)

Suffix Array = [5, 3, 1, 0, 4, 2]

Construction Algorithm

O(n log^2 n) - Simple Version

vector<int> buildSuffixArray(const string& s) {
    int n = s.size();
    vector<int> sa(n), rank(n), tmp(n);
    
    // Initial ranking by first character
    for (int i = 0; i < n; i++) {
        sa[i] = i;
        rank[i] = s[i];
    }
    
    // Double the length each iteration
    for (int k = 1; k < n; k *= 2) {
        // Sort by (rank[i], rank[i+k])
        auto cmp = [&](int a, int b) {
            if (rank[a] != rank[b]) return rank[a] < rank[b];
            int ra = a + k < n ? rank[a + k] : -1;
            int rb = b + k < n ? rank[b + k] : -1;
            return ra < rb;
        };
        
        sort(sa.begin(), sa.end(), cmp);
        
        // Update ranks
        tmp[sa[0]] = 0;
        for (int i = 1; i < n; i++) {
            tmp[sa[i]] = tmp[sa[i-1]] + (cmp(sa[i-1], sa[i]) ? 1 : 0);
        }
        rank = tmp;
    }
    
    return sa;
}

O(n log n) - Optimized with Counting Sort

vector<int> suffixArray(const string& s) {
    int n = s.size();
    const int ALPHA = 256;
    
    vector<int> sa(n), rank(n), tmp(n), cnt(max(n, ALPHA));
    
    // Initial sort by first character
    for (int i = 0; i < n; i++) cnt[s[i]]++;
    for (int i = 1; i < ALPHA; i++) cnt[i] += cnt[i-1];
    for (int i = n-1; i >= 0; i--) sa[--cnt[s[i]]] = i;
    
    rank[sa[0]] = 0;
    for (int i = 1; i < n; i++) {
        rank[sa[i]] = rank[sa[i-1]] + (s[sa[i]] != s[sa[i-1]]);
    }
    
    for (int k = 1; k < n; k *= 2) {
        // Sort by second key (rank[i+k])
        fill(cnt.begin(), cnt.end(), 0);
        for (int i = 0; i < n; i++) {
            int key = i + k < n ? rank[i + k] + 1 : 0;
            cnt[key]++;
        }
        for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
        for (int i = n-1; i >= 0; i--) {
            int key = sa[i] + k < n ? rank[sa[i] + k] + 1 : 0;
            tmp[--cnt[key]] = sa[i];
        }
        
        // Sort by first key (rank[i])
        fill(cnt.begin(), cnt.end(), 0);
        for (int i = 0; i < n; i++) cnt[rank[i]]++;
        for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
        for (int i = n-1; i >= 0; i--) {
            sa[--cnt[rank[tmp[i]]]] = tmp[i];
        }
        
        // Update ranks
        tmp[sa[0]] = 0;
        for (int i = 1; i < n; i++) {
            int prev = sa[i-1], cur = sa[i];
            bool same = rank[prev] == rank[cur];
            if (same) {
                int p2 = prev + k < n ? rank[prev + k] : -1;
                int c2 = cur + k < n ? rank[cur + k] : -1;
                same = p2 == c2;
            }
            tmp[sa[i]] = tmp[sa[i-1]] + (same ? 0 : 1);
        }
        rank = tmp;
        
        if (rank[sa[n-1]] == n-1) break;
    }
    
    return sa;
}

LCP Array

LCP (Longest Common Prefix) Array: lcp[i]lcp[i] = ความยาว common prefix ระหว่าง suffix SA[i]SA[i] และ SA[i1]SA[i-1]

Kasai’s Algorithm - O(n)

vector<int> buildLCP(const string& s, const vector<int>& sa) {
    int n = s.size();
    vector<int> rank(n), lcp(n);
    
    for (int i = 0; i < n; i++) rank[sa[i]] = i;
    
    int k = 0;
    for (int i = 0; i < n; i++) {
        if (rank[i] == 0) {
            k = 0;
            continue;
        }
        
        int j = sa[rank[i] - 1];
        while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
            k++;
        }
        
        lcp[rank[i]] = k;
        if (k > 0) k--;
    }
    
    return lcp;
}

ประยุกต์ใช้งาน

pair<int,int> findPattern(const string& text, const vector<int>& sa, const string& pattern) {
    int n = text.size(), m = pattern.size();
    
    // Find lower bound
    int lo = 0, hi = n;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (text.compare(sa[mid], m, pattern) < 0) lo = mid + 1;
        else hi = mid;
    }
    int left = lo;
    
    // Find upper bound
    hi = n;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (text.compare(sa[mid], m, pattern) <= 0) lo = mid + 1;
        else hi = mid;
    }
    int right = lo;
    
    return {left, right};  // matches at sa[left..right-1]
}

2. นับ Distinct Substrings

long long countDistinctSubstrings(const string& s) {
    int n = s.size();
    vector<int> sa = suffixArray(s);
    vector<int> lcp = buildLCP(s, sa);
    
    // Total substrings = n*(n+1)/2
    // Subtract duplicates (sum of LCP)
    long long total = (long long)n * (n + 1) / 2;
    for (int i = 1; i < n; i++) {
        total -= lcp[i];
    }
    
    return total;
}

3. Longest Repeated Substring

string longestRepeatedSubstring(const string& s) {
    vector<int> sa = suffixArray(s);
    vector<int> lcp = buildLCP(s, sa);
    
    int maxLcp = 0, idx = 0;
    for (int i = 1; i < lcp.size(); i++) {
        if (lcp[i] > maxLcp) {
            maxLcp = lcp[i];
            idx = sa[i];
        }
    }
    
    return s.substr(idx, maxLcp);
}

4. Longest Common Substring ของ 2 Strings

string longestCommonSubstring(const string& a, const string& b) {
    string combined = a + "$" + b;
    int n = combined.size();
    int lenA = a.size();
    
    vector<int> sa = suffixArray(combined);
    vector<int> lcp = buildLCP(combined, sa);
    
    int maxLcp = 0, idx = 0;
    for (int i = 1; i < n; i++) {
        // Check if consecutive suffixes are from different strings
        bool first = sa[i-1] < lenA;
        bool second = sa[i] < lenA;
        
        if (first != second && lcp[i] > maxLcp) {
            maxLcp = lcp[i];
            idx = sa[i];
        }
    }
    
    return combined.substr(idx, maxLcp);
}

Complexity

OperationTimeSpace
Build SAO(nlogn)O(n \log n)O(n)O(n)
Build LCPO(n)O(n)O(n)O(n)
Pattern searchO(mlogn)O(m \log n)O(1)O(1)

Tips

  1. ใช้ LCP array ร่วมกับ SA เพื่อแก้ปัญหาหลายอย่าง
  2. Counting sort ทำให้ build SA เร็วขึ้นจาก O(nlog2n)O(n \log^2 n) เป็น O(nlogn)O(n \log n)
  3. SA-IS Algorithm สามารถ build SA ใน O(n)O(n) แต่ implement ยากกว่า