[toc]

一, 字典树

解决问题:

  1. 对于每个提问,给出以该字符串为前缀的单词的数量
  2. 字符串匹配, 通过字典翻译 (map很容易爆炸)

小心RE!!! 别忘清空内存

  • 插入操作时,静态建树中插入不存在的结点利用的是已经创建好的大数组进行存放,而动态数组则会动态申请一个结点;
  • 删除操作时,静态建树中删除只需将跟节点的next数组都置为空即可,而动态建树中则要递归删除已经分配的每一个结点;
  • 动态建树使用new费时,静态建树预存结点数据很费空间;

1. 动态建树

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <cstdio>
#include <iostream>
#include <cstring>
using namespace std;

struct Trie {
int num;
Trie* next[26];
Trie() {
for (int i = 0; i <= 25; i++) next[i] = NULL;
num = 0;
}
};

Trie *root = new Trie;

void Insert(char* word) {
Trie* p = root;
for (int i = 0; word[i]; i++) {
if (p->next[word[i] - 'a'] == NULL) {
p->next[word[i] - 'a'] = new Trie;
}
p = p->next[word[i] - 'a'];
p->num++;
}
}

void Freedom(Trie *p) {
for (int i = 0; i < 26; i++) {
if (p -> next[i] != NULL)
Freedom(p -> next[i]);
}
delete p;
}

int Find(char* word) {
Trie* p = root;
for (int i = 0; word[i]; i++) {
if (p->next[word[i] - 'a'] == NULL) {
return 0;
}
p = p->next[word[i] - 'a'];
}
return p->num;
}

int main() {
char word[11];
int n, m;
while (cin.getline(word, 12)) {
if (strlen(word) == 0) {
break;
}
Insert(word);
}
while (scanf("%s", word) != EOF) {
printf("%d\n", Find(word));
}
Freedom(root);
return 0;
}

2. 静态建树

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81



#include <bits/stdc++.h>
using namespace std;
const int maxn = 2e6 + 5;

int tree[maxn][30];
int flagg[maxn];

int tot;

void Insert(char *word) {
int len = strlen(word);
int root = 0;
for (int i = 0; i < len; i++) {
int id = word[i] - '0';
if (!tree[root][id])
tree[root][id] = ++tot;
root = tree[root][id];
flagg[root]++;
}
//flagg[root] += 1;
}

int Find(char* word) {
int len = strlen(word);
int root = 0;
int ans = 0;
for (int i = 0; i < len; i++) {
int id = word[i] - '0';
if (!tree[root][id])
return 0;
//if (flagg[tree[root][id]])
//ans += 1;
root = tree[root][id];
}
ans = flagg[root];
return ans;
}

void init() {
for (int i = 0; i <= tot; i++) {
flagg[i] = false;
for (int j = 0; j < 30; j++) {
tree[i][j] = 0;
}
}
tot = 0;
}

int main() {
char word[11];
int n, m;
init();
while (cin.getline(word, 12)) {
if (strlen(word) == 0) {
break;
}
Insert(word);
}
while (scanf("%s", word) != EOF) {
printf("%d\n", Find(word));
}
return 0;
}

/*

banana
band
bee
absolute
acm

ba
b
band
abcf

*/

二, AC自动机

问题描述: 求目标串中出现了几个模式串

AC自动机, 字典树 与, KMP 对比

KMP算法是单模式串的字符匹配算法, 一对一

EX-KMP求母串的每个后缀与子串的最长公共前缀长度

AC自动机可以判断, 多个串是否是一个串的子串, 多对一

字典树可以判断, 某一个串是否是一堆串中那些串的前缀, 一对多

1
2
3
4
5
6
7
8
9
10
11
/*
5
she
he
say
shr
her
yasherhs
*/

//输出3, "she", "he", "her", 满足条件, 是"yasherhs"的子串
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include <stdio.h>
#include <algorithm>
#include <iostream>
#include <string.h>
#include <queue>
using namespace std;

struct Trie
{
int next[500010][26],fail[500010],end[500010];
int root,L;
int newnode()
{
for(int i = 0;i < 26;i++)
next[L][i] = -1;
end[L++] = 0;
return L-1;
}
void init()
{
L = 0;
root = newnode();
}
void insert(char buf[])
{
int len = strlen(buf);
int now = root;
for(int i = 0;i < len;i++)
{
if(next[now][buf[i]-'a'] == -1)
next[now][buf[i]-'a'] = newnode();
now = next[now][buf[i]-'a'];
}
end[now]++;
}
void build()
{
queue<int>Q;
fail[root] = root;
for(int i = 0;i < 26;i++)
if(next[root][i] == -1)
next[root][i] = root;
else
{
fail[next[root][i]] = root;
Q.push(next[root][i]);
}
while( !Q.empty() )
{
int now = Q.front();
Q.pop();
for(int i = 0;i < 26;i++)
if(next[now][i] == -1)
next[now][i] = next[fail[now]][i];
else
{
fail[next[now][i]]=next[fail[now]][i];
Q.push(next[now][i]);
}
}
}
int query(char buf[])
{
int len = strlen(buf);
int now = root;
int res = 0;
for(int i = 0;i < len;i++)
{
now = next[now][buf[i]-'a'];
int temp = now;
while( temp != root )
{
res += end[temp];
end[temp] = 0;
temp = fail[temp];
}
}
return res;
}
void debug()
{
for(int i = 0;i < L;i++)
{
printf("id = %3d,fail = %3d,end = %3d,chi = [",i,fail[i],end[i]);
for(int j = 0;j < 26;j++)
printf("%2d",next[i][j]);
printf("]\n");
}
}
};
char buf[1000010];
Trie ac;
int main()
{
int T;
int n;
scanf("%d",&T);
while( T-- )
{
scanf("%d",&n);
ac.init();
for(int i = 0;i < n;i++)
{
scanf("%s",buf);
ac.insert(buf);
}
ac.build();
scanf("%s",buf);
printf("%d\n",ac.query(buf));
}
return 0;
}

三, KMP && EXKMP

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
int KMP(char *s,char *p)
{
int i=0;
int j=0;
int lens=strlen(s);
int lenp=strlen(p);
while(i<lens&&j<lenp)
{
//如果j=-1(表示第一次开始或者重新开始匹配),即失配于首字符
if(j==-1||s[i]==p[j])
{
j++;
i++;
}
else
{
//如果j!+-1,或者当前匹配失败 ,则
j=nextt[j]; // i不变,j=next[j]
}
}
if(j==lenp)
return 1;//匹配成功
else
return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
//扩展KMP可求问题:给出一个长为N的字符串S,再给出一个长为M的字符串T
//求S的所有后缀中和T的最长公共前缀
//假设s为文本串(长度为n),T为模式串(长度为m)
char s[N], T[N];
int n, m;
int Next[N]; //Next[i]代表T[i~m-1]与T[0~m-1]最长公共前缀
int ex[N]; //extend数组,代表s[i~n-1]与T[0~m-1]最长公共前缀
//注意到,如果有一个位置extend[i]=m,则表示T在S中出现,而且是在位置i出现,这就是标准的KMP问题
void Get_Next() {
Next[0] = m;
int j = 0;
while (j + 1 < m && T[j] == T[j + 1]) //计算Next[1]
j++;
Next[1] = j
int po = 1; //设po为答案伸得最远的后缀的下标
for (int i = 2; i < m; i++)
int p = Next[po] + po - 1; //已匹配最大位置
int L = Next[i - po]; //前缀长度
if (i + L < p + 1)
Next[i] = L;
else {
j = max(0, p - i + 1);
while (i + j < m && T[i + j] == T[j])
j++;
Next[i] = j;
po = i;
}
}
}

void EX_KMP() {
Get_Next();
int j = 0;
while (j < n && j < m && s[j] == T[j])
j++;
ex[0] = j;
int po = 0;
for (int i = 1; i < n; i++) {
int p = ex[po] + po - 1;
int L = Next[i - po];
if (i + L < p + 1)
ex[i] = L;
else {
j = max(0, p - i + 1);
while (i + j < n && j < m && s[i + j] == T[j])
j++;
ex[i] = j;
po = i;
}
}
}

四, HASH哈希

单hash

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#define base 233
#define inf 1<<30
ull mod = inf;
//定义一个大数(最好是质数)作为模数,这里用的是1<<30
//定义一个base进制,这里是233
il ull hash(char s[]){
ll ans = 0;
ll len = strlen(s);
for(ll i = 0; i < len; i++){
ans = (base * ans + (ull)s[i]) % mod;
}
return ans;
//枚举该字符串的每一位,与base相乘,转化为base进制,加(ull)是为了防止爆栈搞出一个负数,(ull)是无符号的,但其实加了一个ull是可以不用mod的,加个mod更保险
//然而加了mod会很玄学,莫名比不加mod慢了300多ms
}

双hash

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

#include <bits/stdc++.h>
#define ll int
#define inf 1<<30
#define mt(x,y) memset(x,y,sizeof(x))
#define ull unsigned long long
using namespace std;
#define N 10001
#define base 233
ull mod1 = 212370440130137957ll;
ull mod2 = inf;

ll n;
char a[N];
struct node { ll x, y; }f[N];

inline ull hash1(char s[]) {
ll ans = 0, len = strlen(s);
for (ll i = 0; i < len; i++) {
ans = (base * ans + (ull)s[i]) % mod1;
}
return ans;
}

inline ull hash2(char s[]) {
ll ans = 0, len = strlen(s);
for (ll i = 0; i < len; i++) {
ans = (base * ans + (ull)s[i]) % mod2;
}
return ans;
}

bool cmp1(node a, node b) { return a.x < b.x; }
bool cmp2(node a, node b) { return a.y < b.y; }

int main() {
cin >> n;
for (ll i = 1; i <= n; i++) {
scanf("%s", a);
f[i].x = hash1(a);
f[i].y = hash2(a);
}
sort(f + 1, f + n + 1, cmp1); sort(f + 1, f + n + 1, cmp2);
ll ans = 1;
for (ll i = 1; i < n; i++) {
if (f[i].x != f[i + 1].x || f[i].y != f[i + 1].y)ans++;
}
printf("%d\n", ans);
return 0;
}