字符串Hash(自然溢出、单Hash、双Hash)
更新(2018.11.29)
- 加入洛谷《字符串哈希》,借题目记录三种常用的Hash方法:自然溢出、单Hash、双Hash
- 加入 Codeforces 727E
- 更新前言部分
前言(2018.11.29)
字符串Hash常见的方法有 自然溢出, 单Hash 与 双Hash
自然溢出法不容易错,但是出题人想卡你可以卡
单Hash需要搞一个比较大的质数做模数,也有些容易被卡,如果质数小不专门卡都容易错,据说如果数据量在1e5而模数只有1e9量级的话有大概率会错(因为生日悖论问题),质数开大点比较保险
双Hash挺安全的,然而常数太大容易被卡超时
前言(原)
其实这个话题以前写过,但是省赛前某道题目让我发觉字符串的哈希真的是蛮重要的,因此重新提出来讲
- Rabin-Karp
https://www.geeksforgeeks.org/searching-for-patterns-set-3-rabin-karp-algorithm/
https://blog.csdn.net/m0_37846371/article/details/72854890
洛谷P3370 -【模板】字符串哈希
思路 —— 自然溢出
#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef unsigned long long ull;
const ull BASE = 12345;
const int N = 1e4 + 15;
char s[N];
ull a[N];
int main(){
int n;
while(~scanf("%d", &n)){
for(int i = 0; i < n; i++){
scanf("%s", s);
a[i] = 0;
for(int j = 0; s[j]; j++){
a[i] = a[i] * BASE + (ull)s[j];
}
}
sort(a, a + n);
int ans = 1;
for(int i = 1; i < n; i++){
if(a[i] != a[i - 1]){
ans++;
}
}
printf("%d\n", ans);
}
}
思路 —— 单Hash
#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef long long ll;
const ll BASE = 12345;
const ll MOD = 738832927927LL;
const int N = 1e4 + 15;
char s[N];
ll a[N];
int main(){
int n;
while(~scanf("%d", &n)){
for(int i = 0; i < n; i++){
scanf("%s", s);
a[i] = 0;
for(int j = 0; s[j]; j++){
a[i] = (a[i] * BASE + s[j]) % MOD;
}
}
sort(a, a + n);
int ans = 1;
for(int i = 1; i < n; i++){
if(a[i] != a[i - 1]){
ans++;
}
}
printf("%d\n", ans);
}
}
思路 —— 双Hash
#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef unsigned long long ull;
const ull BASE = 12345;
const ull MOD[2] = {(int)1e9 + 7, (int)1e9 + 7};
const int N = 1e4 + 15;
struct Node{
ull x, y;
bool operator < (const Node& b) const{
if(x != b.x) return x < b.x;
return y < b.y;
}
};
char s[N];
Node a[N];
int main(){
int n;
while(~scanf("%d", &n)){
for(int i = 0; i < n; i++){
scanf("%s", s);
a[i].x = a[i].y = 0;
for(int j = 0; s[j]; j++){
a[i].x = ((a[i].x * BASE) + s[j]) % MOD[0];
a[i].y = ((a[i].y * BASE) + s[j]) % MOD[1];
}
}
sort(a, a + n);
int ans = 1;
for(int i = 1; i < n; i++){
if(a[i].x != a[i - 1].x || a[i].y != a[i - 1].y){
ans++;
}
}
printf("%d\n", ans);
}
}
Crazy Search - POJ 1200
题意
给定一个字符串,其中含有NC个不同字符,求长度为N的不同子串的个数
思路
Rabin Karp模板题(这模板题有点难 = =)
首先利用NC转换子串,得到实现Hash的进制,然后用rolling hash(Rabin Karp)算个数
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1e7 + 15;
char s[N];
bool used[N];
char mp[256];
set<int> st;
int rollingHash(char* s, int base, int m){
int ans = 0;
int n = strlen(s);
int t = 1;
for(int i = 0; i < m; i++) t = t * base;
int shash = 0;
for(int i = 0; i < m; i++) shash = shash*base + s[i] - '0';
for(int i = 0; i <= m + n; i++){
if(!used[shash]){
ans++;
used[shash] = true;
}
if(i + m < n) shash = shash*base - (s[i] - '0') * t + s[i + m] - '0';
}
return ans;
}
inline void init(){
memset(mp, 0, sizeof(mp));
memset(used, 0, sizeof(used));
}
int main(){
int m;
int base;
while(~scanf("%d%d%s", &m, &base, s)){
init();
char idx = '0';
for(int i = 0; s[i]; i++){
if(mp[(int)s[i]] == 0){
mp[(int)s[i]] = idx++;
}
s[i] = mp[(int)s[i]];
}
printf("%d\n", rollingHash(s, base, m));
}
}
Good Substrings - CodeForces 271D
题意
给定一字符串和一串01串,01串对应字母a-z,其中0为坏字母,1为好字母,求不超过k个坏字母的子串个数
思路1 —— hash + 数组
用sum统计坏字母数量前缀和,双重循环hash全部子串,如果sum区间超过k直接break,否则将相应hash值存入数组中,最后sort + unique,即可得不重复个数
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1500 + 15;
const ull base = 27;
const ull MOD = 1e8 + 7;
ull arr[N*N];
char isgood[base];
int sum[N];
char s[N];
int tot;
inline void init(){
sum[0] = 0;
tot = 0;
}
int main(){
int lim;
while(~scanf("%s%s%d", s + 1, isgood, &lim)){
init();
int len = strlen(s + 1);
for(int i = 1; i <= len; i++){
sum[i] = sum[i - 1] + (isgood[s[i] - 'a'] == '0');
}
for(int i = 1; i <= len; i++){
ull shash = 0;
for(int j = i; j <= len; j++){
if(sum[j] - sum[i - 1] > lim) break;
shash = shash*MOD + (s[j] - 'a' + 1);
arr[tot++] = shash;
}
}
sort(arr, arr + tot);
printf("%d\n", (int)(unique(arr, arr + tot) - arr));
}
}
思路2 —— Trie树
仍然是用sum统计坏字母数量前缀和,双重循环插入Trie树,有新建结点说明是新子串,ans++,直接得答案
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1500 + 15;
const int base = 26;
const int M = 2e6 + 7;
char isgood[base];
int sum[N];
char s[N];
int tot;
int nxt[M][base];
inline void init(){
memset(nxt, 0, sizeof(nxt));
sum[0] = 0;
tot = 1;
}
int main(){
int lim;
while(~scanf("%s%s%d", s + 1, isgood, &lim)){
init();
int len = strlen(s + 1);
for(int i = 1; i <= len; i++){
sum[i] = sum[i - 1] + (isgood[s[i] - 'a'] == '0');
}
int ans = 0;
for(int i = 1; i <= len; i++){
int rt = 0;
for(int j = i; j <= len; j++){
if(sum[j] - sum[i - 1] > lim) break;
if(nxt[rt][s[j] - 'a'] == 0){
nxt[rt][s[j] - 'a'] = tot++;
ans++;
}
rt = nxt[rt][s[j] - 'a'];
}
}
printf("%d\n", ans);
}
}
Censor - SCU 4438
题意
给定字符串w和p,循环删除p串中w串,每次删除后合并左右两串,问最后串的样子
思路
栈 + hash,具体看代码
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 5e6 + 15;
const ull base = 1e8 + 7;
char p[N], s[N], stk[N];
ull shash[N];
int pstk;
inline void init(){
pstk = 0;
}
ull pow(ull a, ull b){
ull ans = 1, base = a;
while(b){
if(b&1LL) ans = ans * base;
b >>= 1LL;
base = base * base;
}
return ans;
}
int main(){
while(~scanf("%s%s", p, s)){
init();
int plen = strlen(p), slen = strlen(s);
ull t = pow(base, plen);
ull phash = 0;
for(int i = 0; i < plen; i++) phash = phash * base + p[i];
for(int i = 0; i < slen; i++){
stk[++pstk] = s[i];
if(pstk == 1) shash[pstk] = s[i];
else if(pstk <= plen) shash[pstk] = shash[pstk - 1] * base + s[i];
else shash[pstk] = shash[pstk - 1] * base - stk[pstk - plen] * t + s[i];
if(pstk >= plen && shash[pstk] == phash) pstk -= plen;
}
printf("%.*s\n", pstk, stk + 1);
}
}
Games on a CD - CodeForces 727E
题意
给定一个环形字符串,长度为n * k
,再给定g个长度为k
的字符串,问如何由这些字符串组成这个字符串z(每个字符串只能用一次),若不能组成输出NO
思路1 —— 双Hash
拿原串跑长度为k的双Hash,并记录在数组中,再拿g个字符串跑双Hash,记录在map中,最后对原串跳着尝试匹配即可
注意由于每个字符串只能用一次,因此需要开个set标记一下这一Hash值是否出现过
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <map>
#include <set>
#include <vector>
using namespace std;
typedef long long ll;
const int N = 2e6 + 15;
const ll BASE = 233;
const ll MOD[2] = {19260817, 19660813};
struct pr{
ll x, y;
bool operator < (const pr& b) const{
return (x != b.x ? x < b.x : y < b.y);
}
bool operator == (const pr& b) const{
return x == b.x && y == b.y;
}
};
map<pr, int> mp;
set<pr> st;
vector<int> ans;
pr shash[N];
char s[N], p[N];
inline pr getHash(char s[], int n){
pr ret{0, 0};
for(int i = 0; i < n; i++){
ret.x = (ret.x * BASE + s[i]) % MOD[0];
ret.y = (ret.y * BASE + s[i]) % MOD[1];
}
return ret;
}
inline void initSHash(int n, int k){
pr t = {1, 1};
for(int i = 1; i <= k; i++){
t.x = (t.x * BASE) % MOD[0];
t.y = (t.y * BASE) % MOD[1];
}
shash[k - 1] = getHash(s, k);
for(int i = k; i < 2 * n * k; i++){
shash[i].x = ((shash[i - 1].x * BASE + s[i] - t.x * s[i - k]) % MOD[0] + MOD[0]) % MOD[0];
shash[i].y = ((shash[i - 1].y * BASE + s[i] - t.y * s[i - k]) % MOD[1] + MOD[1]) % MOD[1];
}
}
inline bool solve(int n, int k){
for(int i = k - 1; i < 2 * k - 1; i++){
bool flag = true;
ans.clear();
st.clear();
for(int j = 0, pos = i; j < n; j++, pos += k){
pr tmp = shash[pos];
if(mp.find(tmp) == mp.end() || st.count(tmp)){
flag = false;
break;
}else{
ans.push_back(mp[tmp]);
st.insert(tmp);
}
}
if(flag) return true;
}
return false;
}
int main(){
int n, k;
while(~scanf("%d%d", &n, &k)){
scanf("%s", s);
for(int i = 0; i <= n * k; i++){
s[i + n * k] = s[i];
}
initSHash(n, k);
int m;
scanf("%d", &m);
for(int i = 1; i <= m; i++){
scanf("%s", p);
mp[getHash(p, k)] = i;
}
if(!solve(n, k)){
puts("NO");
}else{
puts("YES");
bool tag = true;
for(int val : ans){
if(tag) tag = false;
else putchar(' ');
printf("%d", val);
}
puts("");
}
}
}
思路2 —— AC自动机
把g个字符串丢进AC自动机,然后用字符串跑,跑到结束的位置就标记一下,最后也是跳着匹配,同时也应注意标记一下哪些字符串是用过的
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <iostream>
using namespace std;
const int N = 2e6 + 15;
const int inf = 0x3f3f3f3f;
typedef long long ll;
bool used[N];
char s[N], p[N];
int nxt[N][26];
int fail[N];
int pos[N];
int f[N];
int tot;
inline void init(){
memset(nxt, -1, sizeof(nxt));
memset(fail, 0, sizeof(fail));
memset(pos, -1, sizeof(pos));
tot = 1;
}
void push(int pp, char* p){
int rt = 0;
for(int i = 0; p[i]; i++){
int idx = p[i] - 'a';
if(nxt[rt][idx] == -1){
nxt[rt][idx] = tot++;
}
rt = nxt[rt][idx];
}
pos[rt] = pp;
}
int que[N], head, tail;
void setFail(){
for(int j = 0; j < 26; j++){
if(nxt[0][j] == -1) nxt[0][j] = 0;
else que[tail++] = nxt[0][j];
}
while(head != tail){
int rt = que[head++];
for(int j = 0; j < 26; j++){
if(nxt[rt][j] != -1){
fail[nxt[rt][j]] = nxt[fail[rt]][j];
que[tail++] = nxt[rt][j];
}else{
nxt[rt][j] = nxt[fail[rt]][j];
}
}
}
}
void query(char* s, int k){
int rt = 0;
for(int i = 0; s[i]; i++){
int idx = s[i] - 'a';
rt = nxt[rt][idx];
f[i - k + 1] = pos[rt];
}
}
int main(){
int n, plen;
while(~scanf("%d%d", &n, &plen)){
init();
int slen = n*plen;
scanf("%s", s);
for(int i = 0; i < slen; i++){
s[i + slen] = s[i];
}
slen <<= 1;
int k;
scanf("%d", &k);
for(int i = 1; i <= k; i++){
scanf("%s", p);
push(i, p);
}
setFail();
query(s, plen);
int ansp = -1;
for(int i = 0; i < plen; i++){
int j, q;
for(j = i, q = 0; q < n; j += plen, q++){
if(f[j] == -1 || used[f[j]]) break;
used[f[j]] = true;
}
if(q == n){
ansp = i;
break;
}
for(; j >= i; j -= plen) used[f[j]] = false;
}
if(ansp == -1) puts("NO");
else{
puts("YES");
for(int i = ansp, q = 0; q < n; i += plen, q++){
printf("%d", f[i]);
if(q < n - 1) putchar(' ');
}
puts("");
}
}
return 0;
}