forked from williamfiset/Algorithms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
LongestRepeatedSubstring.java
133 lines (113 loc) · 3.88 KB
/
LongestRepeatedSubstring.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
* Finds the longest repeated substring(s) of a string.
*
* <p>Time complexity: O(nlogn), bounded by suffix array construction
*
* @author William Fiset, william.alexandre.fiset@gmail.com
*/
package com.williamfiset.algorithms.strings;
import java.util.*;
public class LongestRepeatedSubstring {
// Example usage
public static void main(String[] args) {
String str = "ABC$BCA$CAB";
SuffixArray sa = new SuffixArray(str);
System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());
str = "aaaaa";
sa = new SuffixArray(str);
System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());
str = "abcde";
sa = new SuffixArray(str);
System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());
}
public static class SuffixArray {
// ALPHABET_SZ is the default alphabet size, this may need to be much larger
int ALPHABET_SZ = 256, N;
int[] T, lcp, sa, sa2, rank, tmp, c;
public SuffixArray(String str) {
this(toIntArray(str));
}
private static int[] toIntArray(String s) {
int[] text = new int[s.length()];
for (int i = 0; i < s.length(); i++) text[i] = s.charAt(i);
return text;
}
// Designated constructor
public SuffixArray(int[] text) {
T = text;
N = text.length;
sa = new int[N];
sa2 = new int[N];
rank = new int[N];
c = new int[Math.max(ALPHABET_SZ, N)];
construct();
kasai();
}
private void construct() {
int i, p, r;
for (i = 0; i < N; ++i) c[rank[i] = T[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[T[i]]] = i;
for (p = 1; p < N; p <<= 1) {
for (r = 0, i = N - p; i < N; ++i) sa2[r++] = i;
for (i = 0; i < N; ++i) if (sa[i] >= p) sa2[r++] = sa[i] - p;
Arrays.fill(c, 0, ALPHABET_SZ, 0);
for (i = 0; i < N; ++i) c[rank[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[rank[sa2[i]]]] = sa2[i];
for (sa2[sa[0]] = r = 0, i = 1; i < N; ++i) {
if (!(rank[sa[i - 1]] == rank[sa[i]]
&& sa[i - 1] + p < N
&& sa[i] + p < N
&& rank[sa[i - 1] + p] == rank[sa[i] + p])) r++;
sa2[sa[i]] = r;
}
tmp = rank;
rank = sa2;
sa2 = tmp;
if (r == N - 1) break;
ALPHABET_SZ = r + 1;
}
}
// Use Kasai algorithm to build LCP array
private void kasai() {
lcp = new int[N];
int[] inv = new int[N];
for (int i = 0; i < N; i++) inv[sa[i]] = i;
for (int i = 0, len = 0; i < N; i++) {
if (inv[i] > 0) {
int k = sa[inv[i] - 1];
while ((i + len < N) && (k + len < N) && T[i + len] == T[k + len]) len++;
lcp[inv[i] - 1] = len;
if (len > 0) len--;
}
}
}
// Finds the LRS(s) (Longest Repeated Substring) that occurs in a string.
// Traditionally we are only interested in substrings that appear at
// least twice, so this method returns an empty set if this is not the case.
// @return an ordered set of longest repeated substrings
public TreeSet<String> lrs() {
int max_len = 0;
TreeSet<String> lrss = new TreeSet<>();
for (int i = 0; i < N; i++) {
if (lcp[i] > 0 && lcp[i] >= max_len) {
// We found a longer LRS
if (lcp[i] > max_len) lrss.clear();
// Append substring to the list and update max
max_len = lcp[i];
lrss.add(new String(T, sa[i], max_len));
}
}
return lrss;
}
public void display() {
System.out.printf("-----i-----SA-----LCP---Suffix\n");
for (int i = 0; i < N; i++) {
int suffixLen = N - sa[i];
String suffix = new String(T, sa[i], suffixLen);
System.out.printf("% 7d % 7d % 7d %s\n", i, sa[i], lcp[i], suffix);
}
}
}
}