Suffix Array
Suffix Array สำหรับปัญหา String ขั้นสูง
Suffix Array
Suffix Array เป็นโครงสร้างข้อมูลที่เก็บ suffixes ทั้งหมดของ string ในลำดับ lexicographically sorted
นิยาม
สำหรับ string ความยาว :
- Suffix = substring
- Suffix Array = array ของ indices ที่ทำให้ (lexicographically)
ตัวอย่าง
String: "banana"
| i | Suffix |
|---|---|
| 0 | banana |
| 1 | anana |
| 2 | nana |
| 3 | ana |
| 4 | na |
| 5 | a |
Sorted suffixes:
- a (index 5)
- ana (index 3)
- anana (index 1)
- banana (index 0)
- na (index 4)
- nana (index 2)
Suffix Array = [5, 3, 1, 0, 4, 2]
Construction Algorithm
O(n log^2 n) - Simple Version
vector<int> buildSuffixArray(const string& s) {
int n = s.size();
vector<int> sa(n), rank(n), tmp(n);
// Initial ranking by first character
for (int i = 0; i < n; i++) {
sa[i] = i;
rank[i] = s[i];
}
// Double the length each iteration
for (int k = 1; k < n; k *= 2) {
// Sort by (rank[i], rank[i+k])
auto cmp = [&](int a, int b) {
if (rank[a] != rank[b]) return rank[a] < rank[b];
int ra = a + k < n ? rank[a + k] : -1;
int rb = b + k < n ? rank[b + k] : -1;
return ra < rb;
};
sort(sa.begin(), sa.end(), cmp);
// Update ranks
tmp[sa[0]] = 0;
for (int i = 1; i < n; i++) {
tmp[sa[i]] = tmp[sa[i-1]] + (cmp(sa[i-1], sa[i]) ? 1 : 0);
}
rank = tmp;
}
return sa;
}
O(n log n) - Optimized with Counting Sort
vector<int> suffixArray(const string& s) {
int n = s.size();
const int ALPHA = 256;
vector<int> sa(n), rank(n), tmp(n), cnt(max(n, ALPHA));
// Initial sort by first character
for (int i = 0; i < n; i++) cnt[s[i]]++;
for (int i = 1; i < ALPHA; i++) cnt[i] += cnt[i-1];
for (int i = n-1; i >= 0; i--) sa[--cnt[s[i]]] = i;
rank[sa[0]] = 0;
for (int i = 1; i < n; i++) {
rank[sa[i]] = rank[sa[i-1]] + (s[sa[i]] != s[sa[i-1]]);
}
for (int k = 1; k < n; k *= 2) {
// Sort by second key (rank[i+k])
fill(cnt.begin(), cnt.end(), 0);
for (int i = 0; i < n; i++) {
int key = i + k < n ? rank[i + k] + 1 : 0;
cnt[key]++;
}
for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
for (int i = n-1; i >= 0; i--) {
int key = sa[i] + k < n ? rank[sa[i] + k] + 1 : 0;
tmp[--cnt[key]] = sa[i];
}
// Sort by first key (rank[i])
fill(cnt.begin(), cnt.end(), 0);
for (int i = 0; i < n; i++) cnt[rank[i]]++;
for (int i = 1; i < n; i++) cnt[i] += cnt[i-1];
for (int i = n-1; i >= 0; i--) {
sa[--cnt[rank[tmp[i]]]] = tmp[i];
}
// Update ranks
tmp[sa[0]] = 0;
for (int i = 1; i < n; i++) {
int prev = sa[i-1], cur = sa[i];
bool same = rank[prev] == rank[cur];
if (same) {
int p2 = prev + k < n ? rank[prev + k] : -1;
int c2 = cur + k < n ? rank[cur + k] : -1;
same = p2 == c2;
}
tmp[sa[i]] = tmp[sa[i-1]] + (same ? 0 : 1);
}
rank = tmp;
if (rank[sa[n-1]] == n-1) break;
}
return sa;
}
LCP Array
LCP (Longest Common Prefix) Array: = ความยาว common prefix ระหว่าง suffix และ
Kasai’s Algorithm - O(n)
vector<int> buildLCP(const string& s, const vector<int>& sa) {
int n = s.size();
vector<int> rank(n), lcp(n);
for (int i = 0; i < n; i++) rank[sa[i]] = i;
int k = 0;
for (int i = 0; i < n; i++) {
if (rank[i] == 0) {
k = 0;
continue;
}
int j = sa[rank[i] - 1];
while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
k++;
}
lcp[rank[i]] = k;
if (k > 0) k--;
}
return lcp;
}
ประยุกต์ใช้งาน
1. Pattern Matching (Binary Search)
pair<int,int> findPattern(const string& text, const vector<int>& sa, const string& pattern) {
int n = text.size(), m = pattern.size();
// Find lower bound
int lo = 0, hi = n;
while (lo < hi) {
int mid = (lo + hi) / 2;
if (text.compare(sa[mid], m, pattern) < 0) lo = mid + 1;
else hi = mid;
}
int left = lo;
// Find upper bound
hi = n;
while (lo < hi) {
int mid = (lo + hi) / 2;
if (text.compare(sa[mid], m, pattern) <= 0) lo = mid + 1;
else hi = mid;
}
int right = lo;
return {left, right}; // matches at sa[left..right-1]
}
2. นับ Distinct Substrings
long long countDistinctSubstrings(const string& s) {
int n = s.size();
vector<int> sa = suffixArray(s);
vector<int> lcp = buildLCP(s, sa);
// Total substrings = n*(n+1)/2
// Subtract duplicates (sum of LCP)
long long total = (long long)n * (n + 1) / 2;
for (int i = 1; i < n; i++) {
total -= lcp[i];
}
return total;
}
3. Longest Repeated Substring
string longestRepeatedSubstring(const string& s) {
vector<int> sa = suffixArray(s);
vector<int> lcp = buildLCP(s, sa);
int maxLcp = 0, idx = 0;
for (int i = 1; i < lcp.size(); i++) {
if (lcp[i] > maxLcp) {
maxLcp = lcp[i];
idx = sa[i];
}
}
return s.substr(idx, maxLcp);
}
4. Longest Common Substring ของ 2 Strings
string longestCommonSubstring(const string& a, const string& b) {
string combined = a + "$" + b;
int n = combined.size();
int lenA = a.size();
vector<int> sa = suffixArray(combined);
vector<int> lcp = buildLCP(combined, sa);
int maxLcp = 0, idx = 0;
for (int i = 1; i < n; i++) {
// Check if consecutive suffixes are from different strings
bool first = sa[i-1] < lenA;
bool second = sa[i] < lenA;
if (first != second && lcp[i] > maxLcp) {
maxLcp = lcp[i];
idx = sa[i];
}
}
return combined.substr(idx, maxLcp);
}
Complexity
| Operation | Time | Space |
|---|---|---|
| Build SA | ||
| Build LCP | ||
| Pattern search |
Tips
- ใช้ LCP array ร่วมกับ SA เพื่อแก้ปัญหาหลายอย่าง
- Counting sort ทำให้ build SA เร็วขึ้นจาก เป็น
- SA-IS Algorithm สามารถ build SA ใน แต่ implement ยากกว่า