hash function for text in C
Silver
gaynor at paul.rutgers.edu
Thu Oct 18 15:13:09 AEST 1990
You must supply definitions of uint8 (an 8-bit unsigned int), uint16
(similarly), and uint32 (likewise).
----------------------------------- hash.h ------------------------------------
/* This is an implementation of Peter K. Pearson's hash algorithm as */
/* presented in "Fast Hashing of Variable-Length Text Strings", ACM 6/90. */
/* The conclusion of his article is cited below. */
/* */
/* The main advantages of the hashing function presented here are: */
/* 1. No restriction is placed on the length of the text string. */
/* 2. The length of the text string does not need to be known beforehand. */
/* 3. Very little arithmetic is performed on each character being hashed. */
/* 4. Similar strings are not likely to collide. */
/* 5. Minimal, perfect hashing functions can be built in this form. */
/* [6. It's average distribution is quite random.] */
/* */
/* Its principal disadvantages are: */
/* 1. Output value ranges that are not powers of 2 are somwehat more */
/* complicated to provide. */
/* 2. More auxiliary memory (the 256-byte table T) is required by this */
/* hashing function than by many traditional functions. */
/* [3. I blew 50 cents copying the article and then had to type it in.] */
/* */
/* Elaborating on advantage 5 above, by fatootsing with the permutation */
/* table T, one can make hash8 perform perfect minimal hashes. The author */
/* provides the following guidelines for construction of such a table: */
/* */
/* [C[N] are the characters of the string being hashed and h[N] is the Nth */
/* iteration of the loop.] */
/* */
/* 1. A table T was consructed by pseudorandom permutation of the integers */
/* (0 ... 255). */
/* 2. One by one, the desired values were assigned to the words in the */
/* list. Each assignment was effected by exchanging two elemnts in the */
/* table. */
/* 3. For each word, the first candidate considered for exchange was */
/* T[h[n-1] ^ C[n]], the last table element referenced in the */
/* computation of the hash function for that word. */
/* 4. A table element could not be exchanged if it was referenced during */
/* the hashing of a previously assgined word or if it was referenced */
/* earlier in the hashing of the same word. */
/* 5. If the necessary exchange was forbidden by Rule 4, attention was */
/* shifted to the previously referenced table element, */
/* T[[h[n-2]] ^ C[n-1]]. */
/* */
/* The procedure is not always successful. For example, using the ascii */
/* character codes, if the word "a" hashes to 0 and the word "i" hashes to */
/* 15, it turns out that the word "in" must hash to 0. Initial attempts */
/* to map Knuth's 31 words onto the integers (0 ... 30) failed for exactly */
/* this reason. The shift to the range (1 ... 31) was an ad hoc tactic to */
/* circumvent this problem. */
/* Hash s into an 8, 16, or 32 bit unsigned integer. The larger-sized */
/* values are generated by applying hash8 in parallel. */
extern uint8 hash8(uint8 * s);
extern uint16 hash16(uint8 * s);
extern uint32 hash32(uint8 * s);
/* The hash permutation table. */
extern uint32 * hashpermutation;
----------------------------------- hash.c ------------------------------------
#include "hash.h"
static uint8 permutation[256] =
{
51, 140, 233, 27, 118, 125, 170, 138, 119, 132, 174, 97, 25, 110, 1, 14,
65, 36, 40, 188, 73, 173, 7, 30, 68, 56, 169, 234, 107, 177, 197, 87,
28, 210, 186, 67, 2, 15, 115, 48, 223, 148, 211, 57, 190, 104, 213, 49,
144, 172, 147, 124, 157, 238, 167, 183, 78, 75, 58, 22, 70, 103, 181, 12,
254, 41, 198, 168, 46, 79, 241, 156, 83, 128, 66, 60, 86, 141, 161, 176,
221, 54, 192, 252, 116, 95, 206, 35, 88, 133, 154, 250, 237, 253, 85, 178,
93, 159, 155, 42, 9, 89, 3, 61, 201, 158, 106, 82, 240, 255, 218, 102,
189, 8, 33, 4, 145, 16, 150, 26, 99, 100, 195, 175, 34, 50, 80, 166,
194, 195, 164, 29, 134, 105, 55, 143, 122, 130, 245, 208, 72, 77, 64, 121,
139, 232, 191, 108, 228, 137, 59, 74, 11, 126, 171, 5, 242, 101, 239, 193,
112, 113, 98, 21, 207, 225, 151, 251, 92, 91, 17, 127, 20, 81, 24, 6,
43, 196, 204, 247, 212, 224, 220, 94, 32, 13, 187, 199, 214, 18, 226, 84,
71, 231, 165, 19, 202, 217, 90, 129, 136, 153, 182, 111, 244, 45, 236, 249,
109, 47, 180, 205, 215, 160, 53, 162, 114, 246, 179, 62, 227, 96, 142, 230,
184, 146, 117, 39, 69, 37, 23, 63, 52, 216, 0, 135, 149, 31, 38, 44,
209, 120, 76, 203, 229, 123, 131, 152, 10, 219, 243, 248, 235, 222, 200, 163
};
uint8 hash8(register uint8 * s)
{register static uint8 h;
if (*s)
{h = permutation[*s];
while (*++s)
h ^= permutation[h ^ *s];
return(h);}
else
{return((uint8)0);}
uint16 hash16(register uint8 * s);
{register uint8 h1;
register uint8 h2;
if (*s)
{h1 = permutation[*s];
h2 = permutation[*s + 1];
while (*++s)
{h1 ^= permutation[h1 ^ *s];
h2 ^= permutation[h2 ^ *s];}
return((uint16)((h1<<8) & h2));}
else
{return((uint16)0);}}
uint32 hash32(register uint8 * s);
{register uint8 h1;
register uint8 h2;
register uint8 h3;
register uint8 h4;
if (*s)
{h1 = permutation[*s]
h2 = permutation[*s + 1];
h2 = permutation[*s + 2];
h2 = permutation[*s + 3];
while (*++s)
{h1 ^= permutation[h1 ^ *s];
h2 ^= permutation[h2 ^ *s];
h3 ^= permutation[h3 ^ *s];
h4 ^= permutation[h4 ^ *s];}
return((uint32)((h1<<24) & (h2<<16) & (h3<<8) & h4));}
else
{return((uint32)0);}}
More information about the Comp.lang.c
mailing list