perfect hash function finder
Keith Bostic
keith at seismo.UUCP
Thu Sep 27 08:03:46 AEST 1984
History:
On 10/7/83 C. Havener (charlie at genrad.UUCP) put a perfect hash
function finding program on the net. On 11/4/84 R. Dietrich
(robertd at tektronix.UUCP) put a Pascal version of the same program
on the net. Both of these programs were based on "More on Minimal
Perfect Hash Tables," Colorado State University Technical Report,
April 1981, by Cook, Curtis R. and Oldehoeft, R. R., and "Minimal
Perfect Hash Functions Made Simple" by Richard J. Cichelli - Comm.
of ACM Jan 1980.
These programs had the following limitations, some of which they shared,
some of which they didn't:
-- limited number of strings (40 or 110) per hash table.
-- slow to extremely slow run-time (large amounts of both
recursion and parallel array processing).
-- a requirement that the hash strings be in a certain order.
-- a fixed hashing algorithm.
-- inability to handle strings that hashed to identical values.
In a recent program I was doing, I had to hash approximately 500 different
keywords, preferably in one access. To do this, I ended up rewriting the
above programs. The attached version has fixed some of the limitations.
-- no keyword limit; I've used it to hash up to about 2400 entries.
-- it's several orders of magnitude faster.
-- it's fairly indifferent to the order the strings are presented in.
-- you can specify the keys to be hashed.
-- it handles strings that hash to identical values.
The basic algorithm is:
hash = the length of the string plus the sum of all of the
associated values of the key characters.
The program assigns values to the characters it is using for
hashing until a set of the values gives each keyword a unique
value.
For example: If the string was "thekid" and the key characters were
't' and 'd', with 't' having a value of 5 and 'd' having a value of
8, the hash value of "thekid" would be 19; 8 + 5 + strlen("thekid");
It takes two arguments --
-d -- print out some interesting debugging information.
-k -- use the following characters of the string in the hashing algorithm.
If the program was called as "perf -k137" and the string "abcdefgh"
was entered, the characters 'a', 'c', and 'g' would be used by the
hashing scheme. To make this easier, the program prints out a routine
at the end of its run that, when passed a string, will return the
correct hash value. Also, the special character '$' tells the program
to use the last character of every string. Default, if the k flag
isn't specified, is "-k1$", i.e. the first and last character of every
string.
followed by a filename containing the keywords.
Ex: suppose there's a file "foo" containing the keywords:
======================================================================
political
man
can
have
as
his
aim
the
realization
of
freedom
======================================================================
the command "perf foo" produces the following:
======================================================================
52 -- political
44 -- man ** the keywords are listed
45 -- can with their hash values.
33 -- have
19 -- as
39 -- his
14 -- aim
26 -- the
61 -- realization
50 -- of
43 -- freedom
perf: 11 keywords.
******
static long ** followed by a routine that
hash(str) will return that value if
register char *str; passed the keyword as an argument.
{
static int cval[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 12,
0, 10, 25, 0, 19, 0, 0, 0, 27, 11,
30, 23, 16, 0, 20, 17, 13, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
};
register long hval;
register int len;
switch(hval = (long)(len = strlen(str))) {
default:
hval += cval[(int)str[0]];
hval += cval[(int)str[len - 1]];
case 0:
return(hval);
}
}
======================================================================
Well, guess that's about it, I'd be interested in any comments, fixes,
whatever.
Enjoy, Keith Bostic
ARPA: keith at seismo
UUCP: seismo!keith
.............. cut on the dotted line .................................
#include <stdio.h>
#include <ctype.h>
/*
* Find a "perfect" hashing function.
* Args:
* d: turn on debugging.
* k: specify chars to use; '$' means use last character.
* Tuneable defines:
* TRYLIM: how many guesses each character change gets
* TRYNUM: how many characters to try per conflict
* RANGE: how big a range to distribute numbers over
* Bugs, fixes, etc:
* Keith Bostic keith at seismo.ARPA
* seismo!keith.UUCP
*/
#define CSIZE 127 /* hold all possible characters */
#define JVAL 3 /* how much to jump when trying values */
#define MAXKEY 15 /* maximum length of key word */
#define NO 0 /* general no, not okay */
#define RANGE 3 /* range to scatter keys over */
#define TRYLIM 10 /* how many attempts to forecast */
#define TRYNUM 5 /* how many chars to attempt */
#define YES 1 /* general yes, okay */
#define KEY struct key /* structure to hold hash strings */
KEY {
char *word; /* hash string itself */
long hval; /* hash value itself */
short set[MAXKEY + 1], /* set of chars to hash */
len, /* length of hash string */
link; /* if hard linked to another key */
} *begkey, /* start of hash strings */
*endkey, /* end of hash strings */
*b_first, /* best, first key hit */
*cur; /* current hash string */
static int b_hit, /* best hit value so far */
nkeys, /* number of keys given */
split, /* how wide a spread for key values */
cval[CSIZE], /* associated character values */
use[CSIZE]; /* number of times character used */
static char *myname; /* name program called with */
static short b_new, /* best increment value so far */
b_val, /* char best stuff found for */
debug, /* if debugging wanted */
pk[11] = { 1, -1, 0 }; /* keys to look at, default 1,$ */
main(argc,argv)
int argc;
char **argv;
{
register KEY *K;
myname = *argv;
parse(argc,argv);
keyread();
setlink();
setuse();
srand(1);
hash(begkey);
for (cur = begkey + 1;cur < endkey;++cur)
if (!cur->link) {
if (debug) printf("current key: <%s>\n",cur->word);
hash(cur);
for (K = begkey;K < cur;++K)
if (!K->link && K->hval == cur->hval) {
if (debug) printf("\t<%s> <%s> hashed to <%D>\n",K->word,cur->word,K->hval);
change(K);
break;
}
}
pvals();
pcode();
}
static /* change a character value */
change(arg) /* try least-used characters first */
KEY *arg;
{
register short *S1,
*S2,
*S3;
register KEY *K;
short val1[2 * MAXKEY + 1],
val2[MAXKEY + 1];
/* copy lists of values */
for (S1 = val1,S2 = arg->set;*S1++ = *S2++;);
for (S1 = val2,S2 = cur->set;*S1++ = *S2++;);
for (S1 = val1;*S1;++S1) /* if value in both keys equal */
for (S2 = val2;;++S2) /* number of times, ignore it */
if (!*S2) break;
else if (*S2 == *S1) {
*S2 = *S1 = -1;
break;
} /* clean out repeats/undefs in val1 */
for (S1 = S2 = val1;*S1 = *S2;++S2) {
if (*S1 != -1) ++S1;
for (S3 = S2 + 1;*S3;++S3)
if (*S3 == *S2) *S3 = -1;
}
for (S2 = val2;*S1 = *S2;++S2) {
if (*S1 != -1) ++S1;
for (S3 = S2 + 1;*S3;++S3)
if (*S3 == *S2) *S3 = -1;
} /* S3 now storage or counter */
for (S1 = val1,S3 = val2;*S1;++S1) /* sort by usage, bubble */
for (S2 = S1 + 1;*S2;++S2) /* since max MAXKEY * 2 items */
if (use[*S1] > use[*S2]) {
*S3 = *S1;
*S1 = *S2;
*S2 = *S3;
}
b_hit = nkeys + 1;
for (*S3 = 0,S1 = val1;*S1 && *S3 < TRYNUM;++S1,++*S3)
if (!fore(*S1)) return; /* try changing some values */
if (debug) printf("\tused <%c> with <%d> hits.\n",(char)b_val,b_hit);
cval[b_val] = b_new; /* take best value */
for (K = begkey;K < b_first;++K) if (!K->link) hash(K);
cur = b_first - 1; /* reset current value */
}
static /* find out how character value change will effect */
fore(val) /* already successfully hashed items */
short val;
{
register KEY *K1,
*K2;
register short *S;
register int hit;
KEY *K_frst;
int save;
short cnt;
save = cval[val]; /* save original values */
cval[val] = rand() % split; /* get a new one */
if (debug) printf("\ttrying <%c> used <%d> times -- ",(char)val,use[val]);
for (cnt = 0;cnt < TRYLIM;++cnt) {
for (K1 = begkey;K1 <= cur;++K1) hash(K1);
K_frst = (KEY *)NULL;
for (hit = 0,K1 = begkey;K1 < cur;++K1) {
if (K1->link) continue;
for (S = K1->set;*S && *S != val;++S);
if (!*S) continue;
for (K2 = K1 + 1;K2 <= cur;++K2)
if (!K2->link && K1->hval == K2->hval) {
if (++hit >= b_hit) goto loop;
if (!K_frst) K_frst = K2;
}
}
b_new = cval[b_val = val];
b_first = K_frst;
if (!(b_hit = hit)) {
if (debug) puts("0");
return(NO); /* no hits, use it */
}
loop: if (debug) printf("%d ",hit);
cval[val] += JVAL;
}
cval[val] = save; /* restore original values */
if (debug) putchar('\n');
return(YES);
}
static
keyread() /* get key words */
{
register short *S1,
*S2;
register KEY *K;
char name[MAXKEY],
*gets(), *malloc(), *strcpy();
while (gets(name)) if (*name) ++nkeys;
if (!nkeys) die("no key words supplied.");
rewind(stdin);
if (!(begkey = (KEY *)malloc((unsigned)(sizeof(KEY) * nkeys)))) die("malloc failure.");
for (K = begkey;gets(name);) {
if (!*name) continue;
if (!(K->word = malloc((unsigned)((K->len = strlen(name)) + 1)))) die("malloc failure.");
strcpy(K->word,name);
for (S1 = K->set,S2 = pk;*S2;++S2)
if (*S2 == -1) *S1++ = (short)K->word[K->len - 1];
else if (K->len >= *S2) *S1++ = (short)K->word[*S2 - 1];
if (S1 == K->set) {
fprintf(stderr,"%s: can't hash keyword %s with %s.\n",myname,name,pk);
exit(1);
}
if (debug) printf("read <%s>\n",K->word);
++K;
}
endkey = begkey + nkeys;
split = nkeys * RANGE;
}
static
hash(K) /* actually hash a string */
register KEY *K;
{
register short *S;
for (K->hval = K->len,S = K->set;*S;++S) K->hval += cval[*S];
}
static
pvals() /* print out hashed values */
{
register KEY *K1,
*K2;
register short cnt;
int nlink = 0;
if (debug) {
for (cnt = 0;cnt < CSIZE;++cnt)
if (use[cnt])
if (isprint((char)cnt)) printf("%-3c -- %d\n",(char)cnt,cval[cnt]);
else printf("<%o> -- %d",cnt,cval[cnt]);
}
for (K1 = begkey;K1 < endkey;++K1) hash(K1);
for (K1 = begkey;K1 < endkey;++K1) {
if (K1->hval == -1) continue;
printf("%-10D -- %s",K1->hval,K1->word);
for (K2 = K1 + 1;K2 < endkey;++K2)
if (K2->hval == K1->hval) {
++nlink;
putc(' ',stdout);
fputs(K2->word,stdout);
K2->hval = -1;
}
putc('\n',stdout);
}
printf("\n%s: %d keywords.\n",myname,nkeys);
if (nlink) printf("%s: %d linked items.\n",myname,nlink);
}
static
parse(nargc,nargv) /* deal with arguments */
int nargc; /* set debug flag */
char **nargv; /* open keyword file for reading */
{
extern int optind;
extern char *optarg;
register short *S;
register char *C;
register int c;
while ((c = getopt(nargc,nargv,"dk:")) != EOF)
switch((char)c) {
case 'd': /* debug flag */
debug = YES;
printf("starting %s.\n",myname);
break;
case 'k': /* number of keys */
for (C = optarg,S = pk;*C;++C)
if (*C <= '9' && *C > '0') *S++ = *C - '0';
else if (*C == '$') *S++ = -1;
else die("illegal key value. Use 1-9 or $.");
if (S == pk) die("no selected keys.");
*S = 0;
break;
default:
fprintf(stderr,"usage: %s [-d] [-k keys] file\n",myname);
exit(1);
}
if (!*nargv[optind] || !freopen(nargv[optind],"r",stdin)) die("unable to read key word file.");
}
static
die(msg) /* die with some last words */
register char *msg;
{
fprintf(stderr,"%s: %s\n",myname,msg);
exit(1);
}
static
setlink() /* get rid of unhashables */
{ /* if keys are identical (in any order) */
register short *S1, /* and length is the same, it's a link */
*S2;
register KEY *K1,
*K2;
short hold[MAXKEY + 1];
for (K1 = begkey;K1 < endkey - 1;++K1)
if (K1->link) continue;
else for (K2 = K1 + 1;K2 < endkey;++K2) {
if (K2->link || K2->len != K1->len) continue;
for (S1 = hold,S2 = K2->set;*S1++ = *S2++;);
for (S1 = K1->set;;++S1) {
/* lengths are equal */ if (!*S1) {
/* so keys must be too */ K2->link = YES;
if (debug) printf("linked <%s> to <%s>\n",K2->word,K1->word);
break;
}
for (S2 = hold;*S2 && *S2 != *S1;++S2);
if (*S2) *S2 = -1;
else break;
}
}
}
setuse() /* initialize how many times a character is used */
{ /* initialize values of those characters */
register KEY *K;
register short *S,
cnt;
for (K = begkey;K < endkey;++K)
if (!K->link)
for (S = K->set;*S;++S) ++use[*S];
for (cnt = 0;cnt < CSIZE;++cnt)
if (use[cnt]) cval[cnt] = rand() % split;
}
static
pcode() /* print out the code to hash */
{
register short *S1,
*S2;
register short cnt;
char hold;
short dollar = NO;
for (S1 = pk;*S1;++S1)
if (*S1 == -1) {
dollar = YES;
break;
}
fputs("\n******\n\nstatic long\nhash(str)\nregister char\t*str;\n{\n\tstatic int cval[] = {",stdout);
for (cnt = 0;cnt < CSIZE;++cnt) {
if (!(cnt % 10)) fputs("\n\t\t",stdout);
printf("%d, ",cval[cnt]);
}
fputs("\n\t};\n\tregister long\thval;\n",stdout);
if (dollar) fputs("\tregister int\tlen;\n\n\tswitch(hval = (long)(len = strlen(str)))",stdout);
else fputs("\n\tswitch(hval = strlen(str))",stdout);
fputs(" {\n\t\tdefault:\n",stdout);
for (S1 = pk;*S1;++S1)
for (S2 = S1 + 1;*S2;++S2)
if (*S2 < *S1) {
hold = *S2;
*S2 = *S1;
*S1 = hold;
}
cnt = *--S1;
printf("\t\t\thval += cval[(int)str[%d]];\n",*S1 - 1);
for (--S1;S1 >= pk;--S1)
if (*S1 == -1) printf("\t\t\thval += cval[(int)str[len - 1]];\n",*S1);
else {
while (--cnt > *S1) printf("\t\tcase %d:\n",cnt);
printf("\t\tcase %d:\n\t\t\thval += cval[(int)str[%d]];\n",*S1,*S1 - 1);
}
fputs("\t\tcase 0:\n\t\t\treturn(hval);\n\t}\n}\n",stdout);
}
More information about the Comp.sources.unix
mailing list