字符串Hash(自然溢出、单Hash、双Hash)



更新(2018.11.29)

  1. 加入洛谷《字符串哈希》,借题目记录三种常用的Hash方法:自然溢出、单Hash、双Hash
  2. 加入 Codeforces 727E
  3. 更新前言部分

前言(2018.11.29)

字符串Hash常见的方法有 自然溢出单Hash双Hash
自然溢出法不容易错,但是出题人想卡你可以卡
单Hash需要搞一个比较大的质数做模数,也有些容易被卡,如果质数小不专门卡都容易错,据说如果数据量在1e5而模数只有1e9量级的话有大概率会错(因为生日悖论问题),质数开大点比较保险
双Hash挺安全的,然而常数太大容易被卡超时

前言(原)

其实这个话题以前写过,但是省赛前某道题目让我发觉字符串的哈希真的是蛮重要的,因此重新提出来讲

  1. Rabin-Karp
    https://www.geeksforgeeks.org/searching-for-patterns-set-3-rabin-karp-algorithm/
    https://blog.csdn.net/m0_37846371/article/details/72854890

洛谷P3370 -【模板】字符串哈希


思路 —— 自然溢出

#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef unsigned long long ull;
const ull BASE = 12345;
const int N = 1e4 + 15;

char s[N];
ull a[N];

int main(){
    int n;
    while(~scanf("%d", &n)){
        for(int i = 0; i < n; i++){
            scanf("%s", s);
            a[i] = 0;
            for(int j = 0; s[j]; j++){
                a[i] = a[i] * BASE + (ull)s[j];
            }
        }
        sort(a, a + n);

        int ans = 1;
        for(int i = 1; i < n; i++){
            if(a[i] != a[i - 1]){
                ans++;
            }
        }
        printf("%d\n", ans);
    }
}

思路 —— 单Hash

#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef long long ll;
const ll BASE = 12345;
const ll MOD = 738832927927LL;
const int N = 1e4 + 15;

char s[N];
ll a[N];

int main(){
    int n;
    while(~scanf("%d", &n)){
        for(int i = 0; i < n; i++){
            scanf("%s", s);
            a[i] = 0;
            for(int j = 0; s[j]; j++){
                a[i] = (a[i] * BASE + s[j]) % MOD;
            }
        }
        sort(a, a + n);

        int ans = 1;
        for(int i = 1; i < n; i++){
            if(a[i] != a[i - 1]){
                ans++;
            }
        }
        printf("%d\n", ans);
    }
}

思路 —— 双Hash

#include <cstdio>
#include <algorithm>
#include <functional>
using namespace std;
typedef unsigned long long ull;
const ull BASE = 12345;
const ull MOD[2] = {(int)1e9 + 7, (int)1e9 + 7};
const int N = 1e4 + 15;

struct Node{
    ull x, y;
    bool operator < (const Node& b) const{
        if(x != b.x)    return x < b.x;
        return y < b.y;
    }
};

char s[N];
Node a[N];

int main(){
    int n;
    while(~scanf("%d", &n)){
        for(int i = 0; i < n; i++){
            scanf("%s", s);
            a[i].x = a[i].y = 0;
            for(int j = 0; s[j]; j++){
                a[i].x = ((a[i].x * BASE) + s[j]) % MOD[0];
                a[i].y = ((a[i].y * BASE) + s[j]) % MOD[1];
            }
        }
        sort(a, a + n);

        int ans = 1;
        for(int i = 1; i < n; i++){
            if(a[i].x != a[i - 1].x || a[i].y != a[i - 1].y){
                ans++;
            }
        }
        printf("%d\n", ans);
    }
}


Crazy Search - POJ 1200

题意
给定一个字符串,其中含有NC个不同字符,求长度为N的不同子串的个数
思路
Rabin Karp模板题(这模板题有点难 = =)
首先利用NC转换子串,得到实现Hash的进制,然后用rolling hash(Rabin Karp)算个数

#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1e7 + 15;

char s[N];
bool used[N];
char mp[256];
set<int> st;

int rollingHash(char* s, int base, int m){
    int ans = 0;
    int n = strlen(s);

    int t = 1;
    for(int i = 0; i < m; i++)  t = t * base;

    int shash = 0;
    for(int i = 0; i < m; i++)  shash = shash*base + s[i] - '0';

    for(int i = 0; i <= m + n; i++){
        if(!used[shash]){
            ans++;
            used[shash] = true;
        }
        if(i + m < n)       shash = shash*base - (s[i] - '0') * t + s[i + m] - '0';
    }
    return ans;
}

inline void init(){
    memset(mp, 0, sizeof(mp));
    memset(used, 0, sizeof(used));
}

int main(){
    int m;
    int base;
    while(~scanf("%d%d%s", &m, &base, s)){
        init();
        char idx = '0';
        for(int i = 0; s[i]; i++){
            if(mp[(int)s[i]] == 0){
                mp[(int)s[i]] = idx++;
            }
            s[i] = mp[(int)s[i]];
        }
        printf("%d\n", rollingHash(s, base, m));
    }
}


Good Substrings - CodeForces 271D

题意
给定一字符串和一串01串,01串对应字母a-z,其中0为坏字母,1为好字母,求不超过k个坏字母的子串个数
思路1 —— hash + 数组
用sum统计坏字母数量前缀和,双重循环hash全部子串,如果sum区间超过k直接break,否则将相应hash值存入数组中,最后sort + unique,即可得不重复个数

#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1500 + 15;
const ull base = 27;
const ull MOD = 1e8 + 7;

ull  arr[N*N];
char isgood[base];
int  sum[N];
char s[N];
int  tot;

inline void init(){
    sum[0] = 0;
    tot = 0;
}

int main(){
    int lim;
    while(~scanf("%s%s%d", s + 1, isgood, &lim)){
        init();
        int len = strlen(s + 1);
        for(int i = 1; i <= len; i++){
            sum[i] = sum[i - 1] + (isgood[s[i] - 'a'] == '0');
        }

        for(int i = 1; i <= len; i++){
            ull shash = 0;
            for(int j = i; j <= len; j++){
                if(sum[j] - sum[i - 1] > lim)   break;
                shash = shash*MOD + (s[j] - 'a' + 1);
                arr[tot++] = shash;
            }
        }
        sort(arr, arr + tot);
        printf("%d\n", (int)(unique(arr, arr + tot) - arr));
    }
}

思路2 —— Trie树
仍然是用sum统计坏字母数量前缀和,双重循环插入Trie树,有新建结点说明是新子串,ans++,直接得答案

#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 1500 + 15;
const int base = 26;
const int M = 2e6 + 7;

char isgood[base];
int  sum[N];
char s[N];
int  tot;
int  nxt[M][base];

inline void init(){
    memset(nxt, 0, sizeof(nxt));
    sum[0] = 0;
    tot = 1;
}

int main(){
    int lim;
    while(~scanf("%s%s%d", s + 1, isgood, &lim)){
        init();
        int len = strlen(s + 1);
        for(int i = 1; i <= len; i++){
            sum[i] = sum[i - 1] + (isgood[s[i] - 'a'] == '0');
        }

        int ans = 0;
        for(int i = 1; i <= len; i++){
            int rt = 0;
            for(int j = i; j <= len; j++){
                if(sum[j] - sum[i - 1] > lim)   break;
                if(nxt[rt][s[j] - 'a'] == 0){
                    nxt[rt][s[j] - 'a'] = tot++;
                    ans++;
                }
                rt = nxt[rt][s[j] - 'a'];
            }
        }
        printf("%d\n", ans);
    }
}


Censor - SCU 4438

题意
给定字符串w和p,循环删除p串中w串,每次删除后合并左右两串,问最后串的样子
思路
栈 + hash,具体看代码

#include <cstdio>
#include <algorithm>
#include <cstring>
#include <set>
using namespace std;
typedef unsigned long long ull;
const int N = 5e6 + 15;
const ull base = 1e8 + 7;

char p[N], s[N], stk[N];
ull  shash[N];
int  pstk;

inline void init(){
    pstk = 0;
}

ull pow(ull a, ull b){
    ull ans = 1, base = a;
    while(b){
        if(b&1LL)   ans = ans * base;
        b >>= 1LL;
        base = base * base;
    }
    return ans;
}

int main(){
    while(~scanf("%s%s", p, s)){
        init();
        int plen = strlen(p), slen = strlen(s);

        ull t = pow(base, plen);
        ull phash = 0;
        for(int i = 0; i < plen; i++)   phash = phash * base + p[i];

        for(int i = 0; i < slen; i++){
            stk[++pstk] = s[i];
            if(pstk == 1)            shash[pstk] = s[i];
            else if(pstk <= plen)    shash[pstk] = shash[pstk - 1] * base + s[i];
            else                     shash[pstk] = shash[pstk - 1] * base - stk[pstk - plen] * t + s[i];

            if(pstk >= plen && shash[pstk] == phash)    pstk -= plen;
        }

        printf("%.*s\n", pstk, stk + 1);
    }
}


Games on a CD - CodeForces 727E


题意
给定一个环形字符串,长度为n * k,再给定g个长度为k的字符串,问如何由这些字符串组成这个字符串z(每个字符串只能用一次),若不能组成输出NO
思路1 —— 双Hash
拿原串跑长度为k的双Hash,并记录在数组中,再拿g个字符串跑双Hash,记录在map中,最后对原串跳着尝试匹配即可
注意由于每个字符串只能用一次,因此需要开个set标记一下这一Hash值是否出现过

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <map>
#include <set>
#include <vector>
using namespace std;
typedef long long ll;
const int N = 2e6 + 15;
const ll BASE = 233;
const ll MOD[2] = {19260817, 19660813};

struct pr{
    ll x, y;
    bool operator < (const pr& b)  const{
        return (x != b.x ? x < b.x : y < b.y);
    }
    bool operator == (const pr& b)  const{
        return x == b.x && y == b.y;
    }
};
map<pr, int> mp;
set<pr> st;
vector<int> ans;
pr shash[N];
char s[N], p[N];

inline pr getHash(char s[], int n){
    pr ret{0, 0};
    for(int i = 0; i < n; i++){
        ret.x = (ret.x * BASE + s[i]) % MOD[0];
        ret.y = (ret.y * BASE + s[i]) % MOD[1];
    }
    return ret;
}

inline void initSHash(int n, int k){
    pr t = {1, 1};
    for(int i = 1; i <= k; i++){
        t.x = (t.x * BASE) % MOD[0];
        t.y = (t.y * BASE) % MOD[1];
    }

    shash[k - 1] = getHash(s, k);
    for(int i = k; i < 2 * n * k; i++){
        shash[i].x = ((shash[i - 1].x * BASE + s[i] - t.x * s[i - k]) % MOD[0] + MOD[0]) % MOD[0];
        shash[i].y = ((shash[i - 1].y * BASE + s[i] - t.y * s[i - k]) % MOD[1] + MOD[1]) % MOD[1];
    }
}

inline bool solve(int n, int k){
    for(int i = k - 1; i < 2 * k - 1; i++){
        bool flag = true;
        ans.clear();
        st.clear();
        for(int j = 0, pos = i; j < n; j++, pos += k){
            pr tmp = shash[pos];
            if(mp.find(tmp) == mp.end() || st.count(tmp)){
                flag = false;
                break;
            }else{
                ans.push_back(mp[tmp]);
                st.insert(tmp);
            }
        }
        if(flag)    return true;
    }
    return false;
}

int main(){
    int n, k;
    while(~scanf("%d%d", &n, &k)){
        scanf("%s", s);
        for(int i = 0; i <= n * k; i++){
            s[i + n * k] = s[i];
        }
        initSHash(n, k);

        int m;
        scanf("%d", &m);
        for(int i = 1; i <= m; i++){
            scanf("%s", p);
            mp[getHash(p, k)] = i;
        }
        if(!solve(n, k)){
            puts("NO");
        }else{
            puts("YES");
            bool tag = true;
            for(int val : ans){
                if(tag)     tag = false;
                else        putchar(' ');
                printf("%d", val);
            }
            puts("");
        }
    }
}

思路2 —— AC自动机
把g个字符串丢进AC自动机,然后用字符串跑,跑到结束的位置就标记一下,最后也是跳着匹配,同时也应注意标记一下哪些字符串是用过的

#include <cstdio>
#include <cstring>
#include <algorithm>
#include <iostream>
using namespace std;
const int N = 2e6 + 15;
const int inf = 0x3f3f3f3f;
typedef long long ll;

bool used[N];
char s[N], p[N];
int nxt[N][26];
int fail[N];
int pos[N];
int f[N];
int tot;

inline void init(){
    memset(nxt, -1, sizeof(nxt));
    memset(fail, 0, sizeof(fail));
    memset(pos, -1, sizeof(pos));
    tot = 1;
}

void push(int pp, char* p){
    int rt = 0;
    for(int i = 0; p[i]; i++){
        int idx = p[i] - 'a';
        if(nxt[rt][idx] == -1){
            nxt[rt][idx] = tot++;
        }
        rt = nxt[rt][idx];
    }
    pos[rt] = pp;
}

int que[N], head, tail;
void setFail(){
    for(int j = 0; j < 26; j++){
        if(nxt[0][j] == -1)     nxt[0][j] = 0;
        else                    que[tail++] = nxt[0][j];
    }
    while(head != tail){
        int rt = que[head++];
        for(int j = 0; j < 26; j++){
            if(nxt[rt][j] != -1){
                fail[nxt[rt][j]] = nxt[fail[rt]][j];
                que[tail++] = nxt[rt][j];
            }else{
                nxt[rt][j] = nxt[fail[rt]][j];
            }
        }
    }
}

void query(char* s, int k){
    int rt = 0;
    for(int i = 0; s[i]; i++){
        int idx = s[i] - 'a';
        rt = nxt[rt][idx];
        f[i - k + 1] = pos[rt];
    }
}

int main(){
    int n, plen;
    while(~scanf("%d%d", &n, &plen)){
        init();
        int slen = n*plen;
        scanf("%s", s);
        for(int i = 0; i < slen; i++){
            s[i + slen] = s[i];
        }
        slen <<= 1;

        int k;
        scanf("%d", &k);
        for(int i = 1; i <= k; i++){
            scanf("%s", p);
            push(i, p);
        }
        setFail();
        query(s, plen);

        int ansp = -1;
        for(int i = 0; i < plen; i++){
            int j, q;
            for(j = i, q = 0; q < n; j += plen, q++){
                if(f[j] == -1 || used[f[j]])    break;
                used[f[j]] = true;
            }
            if(q == n){
                ansp = i;
                break;
            }
            for(; j >= i; j -= plen)    used[f[j]] = false;
        }
        if(ansp == -1)      puts("NO");
        else{
            puts("YES");
            for(int i = ansp, q = 0; q < n; i += plen, q++){
                printf("%d", f[i]);
                if(q < n - 1)   putchar(' ');
            }
            puts("");
        }
    }
    return 0;
}