Created
February 28, 2017 08:47
-
-
Save tomonari-masada/a2a744a5b8cb19e0727f4efa62577352 to your computer and use it in GitHub Desktop.
extracting maximal substrings from UTF8 Japanese strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#define SEPCHAR '_' | |
#define MAXLEN 32 | |
#define BUFFSIZE 1000000 | |
#define TOKENLEN 8 | |
#define getUTF8CharSize(c) ((((c) & 0x80) == 0) ? 1 : ((((c) & 0xe0) == 0xc0) ? 2 : ((((c) & 0xf0) == 0xe0) ? 3 : ((((c) & 0xf8) == 0xf0) ? 0 : ((((c) & 0xfc) == 0xf8) ? 0 : ((((c) & 0xfe) == 0xfc) ? 0 : 0)))))) | |
int nNumOfChars; | |
char **sChars; | |
int getUTF8CharInt(char *s) | |
{ | |
char c = s[0]; | |
if ((c & 0x80) == 0) return (int) (c & 0x7f); | |
else if ((c & 0xe0) == 0xc0) | |
return ((int) ((s[0] & 0x1f) >> 2)) * 0x100 + ((int) ((s[0] & 0x3) << 6)) + ((int) (s[1] & 0x3f)); | |
else if ((c & 0xf0) == 0xe0) | |
return (((int) ((s[0] & 0xf) << 4)) + ((int) ((s[1] & 0x3c) >> 2))) * 0x100 | |
+ ((int) ((s[1] & 0x3) << 6)) + ((int) (s[2] & 0x3f)); | |
else if ((c & 0xf8) == 0xf0) return 0; | |
else if ((c & 0xfc) == 0xf8) return 0; | |
else if ((c & 0xfe) == 0xfc) return 0; | |
else return 0; | |
} | |
int getUTF8IntChar(int n, char *s) | |
{ | |
if (n <= 0x7f) { s[0] = (char) n; return 1; } | |
else if (n <= 0x7ff) { | |
s[0] = ((char) (n / 0x40)) | 0xc0; | |
s[1] = ((char) (n & 0x3f)) | 0x80; | |
return 2; } | |
else if (n <= 0xffff) { | |
s[0] = ((char) (n / 0x1000)) | 0xe0; | |
s[1] = ((char) ((n & 0xfff) / 0x40)) | 0x80; | |
s[2] = ((char) (n & 0x3f)) | 0x80; | |
return 3; } | |
else | |
return 0; | |
} | |
void fprintUTF8(FILE *f, int *p, int n) | |
{ | |
char s[6]; | |
int i; | |
if (n == 0) n = 1; | |
for (i = 0; i < n; i ++) { | |
if ((*(p)) == 0) return; | |
memset(s, 0, 6); | |
getUTF8IntChar(*(p ++), s); | |
fprintf(f, "%s", s); | |
} | |
return; | |
} | |
int getUTF8str(char *t, int *p, int n) | |
{ | |
char s[6]; | |
int i, l; | |
if (n == 0) n = 1; | |
l = 0; | |
for (i = 0; i < n; i ++) { | |
if ((*(p)) == 0) break; | |
memset(s, 0, 6); | |
getUTF8IntChar(*(p ++), s); | |
l += strlen(s); | |
if (l >= MAXLEN) return 1; | |
strcat(t, s); | |
} | |
i = 0; | |
while (t[i] != 0) if (t[i] == SEPCHAR) return 1; else i ++; | |
return 0; | |
} | |
// https://sites.google.com/site/yuta256/sais | |
unsigned char mask[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}; | |
#define tget(i) ( (t[(i) / 8] & mask[(i) % 8]) ? 1 : 0 ) | |
#define tset(i, b) t[(i) / 8] = (b) ? (mask[(i) % 8] | t[(i) / 8]) : ((~ mask[(i) % 8]) & t[(i) / 8]) | |
#define chr(i) (cs == sizeof(int) ? ((int *) s)[i] : ((unsigned char *) s)[i]) | |
#define isLMS(i) (i > 0 && tget(i) && ! tget(i - 1)) | |
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) | |
{ | |
int i, sum = 0; | |
for (i = 0; i <= K; i ++) bkt[i] = 0; | |
for (i = 0; i < n; i ++) bkt[chr(i)] ++; | |
for (i = 0; i <= K; i ++) { sum += bkt[i]; bkt[i] = end ? sum : sum - bkt[i]; } | |
} | |
void induceSAl(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs) | |
{ | |
int i, j; | |
getBuckets(s, bkt, n, K, cs, false); | |
for (i = 0; i < n; i ++) { | |
j = SA[i] - 1; | |
if (j >= 0 && ! tget(j)) SA[bkt[chr(j)] ++ ] = j; | |
} | |
} | |
void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs) | |
{ | |
int i, j; | |
getBuckets(s, bkt, n, K, cs, true); | |
for (i = n - 1; i >= 0; i --) { | |
j = SA[i] - 1; | |
if (j >= 0 && tget(j)) SA[-- bkt[chr(j)]] = j; | |
} | |
} | |
void SAIS(unsigned char *s, int *SA, int n, int K, int cs) | |
{ | |
int i, j; | |
unsigned char *t = (unsigned char *) malloc(n / 8 + 1); | |
tset(n - 2, 0); tset(n - 1, 1); | |
for (i = n - 3; i >= 0; i --) | |
tset(i, (chr(i) < chr(i + 1) || (chr(i) == chr(i + 1) && tget(i + 1) == 1)) ? 1 : 0); | |
int *bkt = (int *) malloc(sizeof(int) * (K + 1)); | |
getBuckets(s, bkt, n, K, cs, true); | |
for (i = 0; i < n; i ++) SA[i] = -1; | |
for (i = 1; i < n; i ++) | |
if (isLMS(i)) SA[-- bkt[chr(i)]] = i; | |
induceSAl(t, SA, s, bkt, n, K, cs); | |
induceSAs(t, SA, s, bkt, n, K, cs); | |
free(bkt); | |
int n1 = 0; | |
for (i = 0; i < n; i ++) | |
if (isLMS(SA[i])) SA[n1 ++] = SA[i]; | |
for (i = n1; i < n; i ++) SA[i] = -1; | |
int name = 0, prev = -1; | |
for (i = 0; i < n1; i ++) { | |
int pos = SA[i]; bool diff = false; | |
int d; | |
for (d = 0; d < n; d ++) | |
if (prev == -1 || chr(pos + d) != chr(prev + d) || tget(pos + d) != tget(prev + d)) | |
{ diff = true; break; } | |
else if (d > 0 && (isLMS(pos + d) || isLMS(prev + d))) break; | |
if (diff) { name ++; prev = pos; } | |
pos = (pos % 2 == 0) ? pos / 2 : (pos - 1) / 2; | |
SA[n1 + pos] = name - 1; | |
} | |
for (i = n - 1, j = n - 1; i >= n1; i --) | |
if (SA[i] >= 0) SA[j -- ] = SA[i]; | |
int *SA1; int *s1; | |
SA1 = SA; s1 = SA + n - n1; | |
if (name < n1) | |
SAIS((unsigned char *) s1, SA1, n1, name - 1, sizeof(int)); | |
else | |
for (i = 0; i < n1; i ++) SA1[s1[i]] = i; | |
bkt = (int *) malloc(sizeof(int) * (K + 1)); | |
getBuckets(s, bkt, n, K, cs, true); | |
for (i = 1, j = 0; i < n; i ++) | |
if (isLMS(i)) s1[j ++] = i; | |
for (i = 0; i < n1; i ++) SA1[i] = s1[SA1[i]]; | |
for (i = n1; i < n; i ++) SA[i] = -1; | |
for (i = n1 - 1; i >= 0; i --) { | |
j = SA[i]; SA[i] = -1; | |
SA[-- bkt[chr(j)]] = j; | |
} | |
induceSAl(t, SA, s, bkt, n, K, cs); | |
induceSAs(t, SA, s, bkt, n, K, cs); | |
free(bkt); free(t); | |
} | |
void GetHeight(unsigned char *s, int *SA, int *Height, int n, int cs) | |
{ | |
int i; | |
int *r = (int *) malloc(sizeof(int) * n); // rank in SA | |
for (i = 0; i < n; i ++) r[SA[i]] = i; | |
int h = 0; | |
for (i = 0; i < n; i ++) | |
if (r[i] > 0) { | |
int j = SA[r[i] - 1]; | |
while (chr(i + h) == chr(j + h)) h ++; | |
Height[r[i] - 1] = h; | |
if (h > 0) h --; | |
} | |
Height[n - 1] = 0; | |
return; | |
} | |
void ChildTable(int *Height, int *Up, int *Down, int n) | |
{ | |
int i; | |
int *stack = (int *) malloc(sizeof(int)); | |
int size; | |
int last = -1; | |
for (i = 0; i < n; i ++) { Up[i] = Down[i] = -1; } | |
size = 1; | |
stack[size - 1] = 0; | |
for (i = 1; i < n; i ++) { | |
while (Height[i] < Height[stack[size - 1]]) { | |
last = stack[size - 1]; | |
size --; | |
if ((Height[i] <= Height[stack[size - 1]]) && (Height[stack[size - 1]] != Height[last])) | |
Down[stack[size - 1]] = last; | |
} | |
if (last != -1) { | |
Up[i] = last; | |
last = -1; | |
} | |
size ++; | |
stack = (int *) realloc(stack, sizeof(int) * size); | |
stack[size - 1] = i; | |
} | |
free(stack); | |
return; | |
} | |
void NextIndex(int *Height, int *Next, int n) | |
{ | |
int i; | |
int last; | |
int *stack = (int *) malloc(sizeof(int)); | |
int size = 0; | |
for (i = 0; i < n; i ++) { Next[i] = -1; } | |
size = 1; | |
stack[size - 1] = 0; | |
for (i = 1; i < n; i ++) { | |
while (Height[i] < Height[stack[size - 1]]) size --; | |
if (Height[i] == Height[stack[size - 1]]) { | |
last = stack[size - 1]; | |
size --; | |
Next[last] = i; | |
} | |
size ++; | |
stack = (int *) realloc(stack, sizeof(int) * size); | |
stack[size - 1] = i; | |
} | |
free(stack); | |
return; | |
} | |
int nDocs; | |
int *nPos2Doc; | |
int GetInterval(int *input, int *SA, int *BWT, int *height, int *up, int *down, int *next, int n1, int n2) | |
{ | |
int k; | |
char sStr[MAXLEN]; | |
if (n1 != n2) { | |
int i, j, k2; | |
if ((n1 <= up[n2]) && (up[n2] < n2)) | |
i = up[n2]; | |
else | |
i = down[n1 - 1]; | |
k2 = BWT[n1]; | |
k = GetInterval(input, SA, BWT, height, up, down, next, n1, i); | |
if ((k == 0) || (k != k2)) k = 0; | |
j = next[i]; | |
while (j != -1) { | |
k2 = k; | |
k = GetInterval(input, SA, BWT, height, up, down, next, i + 1, j); | |
if ((k == 0) || (k != k2)) k = 0; | |
i = j; | |
j = next[i]; | |
} | |
k2 = k; | |
k = GetInterval(input, SA, BWT, height, up, down, next, i + 1, n2); | |
if ((k == 0) || (k != k2)) k = 0; | |
if (k == 0) { | |
int l = height[n1]; | |
i = n1 + 1; | |
while (i < n2) { | |
if (height[i] < l) l = height[i]; | |
i ++; | |
} | |
for (i = n1; i <= n2; i ++) { | |
/* | |
printf("%d\t%d\t", nPos2Doc[SA[i]], SA[i]); | |
fprintUTF8(stdout, &(input[SA[n1]]), l); | |
printf("\n"); | |
*/ | |
memset(sStr, 0, MAXLEN); | |
if (! getUTF8str(sStr, &(input[SA[n1]]), l)) | |
printf("%d\t%d\t%s\n", nPos2Doc[SA[i]], SA[i], sStr); | |
} | |
} | |
} else { | |
k = BWT[n1]; | |
} | |
return k; | |
} | |
int main() | |
{ | |
char sBuff[BUFFSIZE]; | |
int nCharSize; | |
int nCount; | |
int nLen; | |
int *nInput; | |
int *nSA; | |
int *nBWT; | |
int *nHeight; | |
int *nUp; | |
int *nDown; | |
int *nNext; | |
nLen = 0; | |
nInput = (int *) malloc(sizeof(int)); | |
nDocs = 0; | |
nPos2Doc = (int *) malloc(sizeof(int)); | |
while (! feof(stdin)) { | |
memset(sBuff, 0, BUFFSIZE); | |
fgets(sBuff, BUFFSIZE - 1, stdin); | |
nCount = 0; | |
while (sBuff[nCount] != 0 && sBuff[nCount] != '\r' && sBuff[nCount] != '\n') { | |
nCharSize = getUTF8CharSize(sBuff[nCount]); | |
if (nCharSize > 3) { fprintf(stderr, "a character of length more than 3 bytes\n"); exit(1); } | |
else if (nCharSize > 0) { | |
nLen ++; | |
nInput = (int *) realloc(nInput, sizeof(int) * nLen); | |
nInput[nLen - 1] = getUTF8CharInt(&(sBuff[nCount])); | |
nCount += nCharSize; | |
nPos2Doc = (int *) realloc(nPos2Doc, sizeof(int) * nLen); | |
nPos2Doc[nLen - 1] = nDocs; | |
} | |
} | |
nLen ++; | |
nInput = (int *) realloc(nInput, sizeof(int) * nLen); | |
nInput[nLen - 1] = SEPCHAR; | |
nCount ++; | |
nPos2Doc = (int *) realloc(nPos2Doc, sizeof(int) * nLen); | |
nPos2Doc[nLen - 1] = nDocs; | |
nDocs ++; | |
} | |
nLen ++; | |
nInput = (int *) realloc(nInput, sizeof(int) * nLen); | |
nInput[nLen - 1] = 0; | |
nSA = (int *) malloc(sizeof(int) * nLen); | |
SAIS((unsigned char *) nInput, nSA, nLen, 0xffff, sizeof(int)); | |
nBWT = (int *) malloc(sizeof(int) * nLen); | |
for (nCount = 0; nCount < nLen; nCount ++) | |
if (nSA[nCount] > 0) nBWT[nCount] = nInput[nSA[nCount] - 1]; | |
else nBWT[nCount] = nInput[nLen - 1]; | |
nHeight = (int *) malloc(sizeof(int) * nLen); | |
GetHeight((unsigned char *) nInput, nSA, nHeight, nLen, sizeof(int)); | |
nUp = (int *) malloc(sizeof(int) * nLen); | |
nDown = (int *) malloc(sizeof(int) * nLen); | |
nNext = (int *) malloc(sizeof(int) * nLen); | |
ChildTable(nHeight, nUp, nDown, nLen); | |
NextIndex(nHeight, nNext, nLen); | |
/* | |
for (nCount = 0; nCount < nLen; nCount ++) { | |
char sChar[6]; | |
memset(sChar, 0, 6); | |
getUTF8IntChar(nBWT[nCount], sChar); | |
printf("%d\t%d\t%s\t%d\t%d\t%d\t%d\t", nCount, nSA[nCount], sChar, | |
nHeight[nCount], nUp[nCount], nDown[nCount], nNext[nCount]); | |
fprintUTF8(stdout, &(nInput[nSA[nCount]]), 20); | |
printf("\n"); | |
} | |
*/ | |
nCount = 1; | |
do { | |
int nCount2 = nCount + 1; | |
while ((nCount2 < nLen) && (nInput[nSA[nCount]] == nInput[nSA[nCount2]])) | |
nCount2 ++; | |
GetInterval(nInput, nSA, nBWT, nHeight, nUp, nDown, nNext, nCount, nCount2 - 1); | |
nCount = nCount2; | |
} while (nCount < nLen); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Input format:
Each document in each line.
Output format:
[doc id] [position] [maximal substring]