Minimal perfect hash table generator
rro at csu-cs.UUCP
rro at csu-cs.UUCP
Fri Oct 7 02:19:27 AEST 1983
This is the program I promised in a recently posted article in net.lang.
hplabs!csu-cs!rro
program lhashd( input, output );
(* minimal perfect hash table generation program
input: a list of words which are to be organized into a dense
list so that the hash function
f(word) = length(word) + value(firstletter(word)) +
value(lastletter(word))
will find the word in one probe, or the word is not in
the list.
output: the values associated with each letter
the minimal perfect hash table containing the words.
reference: cook, curtis r. and oldehoeft, r. r., "more on minimal
perfect hash tables," colorado state university technical
report, april 1981.
*)
const maxwordsize = 16; (* up to 16-char words *)
maxlistsize = 110; (* up to 110 input words *)
maxhashsize = 400; (* hash table max index = 400 *)
cha = 'a';
chz = 'z';
type listsize = 1..maxlistsize;
wordsize = 1..maxwordsize;
letters = cha..chz;
hashsize = 1..maxhashsize;
wordinfo = record
row,col : char;
index,length : integer
end;
freqinfo = record
letter : char;
num : integer
end;
parray = packed array[wordsize] of char;
var graph : array[listsize] of wordinfo; (* word triples *)
order : array[1..26] of char; (* decreasing freq letters *)
pnames : array[listsize] of parray; (* input words *)
name : array[wordsize] of char;
lorder : array[letters] of integer; (* index of ordered letters *)
values : array[letters] of integer; (* letter values *)
freq : array[1..26] of freqinfo; (* letter frequencies *)
slot : array[1..30] of integer;
hashtable: array[hashsize] of integer; (* perfect hash table *)
diagonal : array[letters] of boolean; (* occur twice in a word *)
twordrec : wordinfo;
temprec : freqinfo;
(* miscellaneous constants *)
i, j, k, size, pos, len, lastletter, numslots, point,
val, entime, sttime, lowest, highest: integer;
ch, ch1, ch2: char;
(**************** procedure initialize ************************)
procedure initialize;
begin
for i := 1 to maxlistsize do
graph[i].length := 0;
for ch := cha to chz do
begin
values [ch] := 0;
diagonal [ch] := false
end;
for i := 1 to maxhashsize do
hashtable [i] := 0;
lowest := 100;
highest := 0;
k := ord('a') - 1;
for i := 1 to 26 do
begin
k := k +1;
freq [i].num := 0;
freq [i].letter := chr(k)
end
end; (* initialize *)
(******************** procedure readnames ******************************
readnames inputs words separated by commas or blanks.
finds first and last letter and length of each word.
computes frequency of each letter (number of times it occurs as first or
last letter in a word).
pnames[i]............ i th word
graph[i].row ........ first letter of i th word
graph[i].col ........ last letter of i th word
graph[i].length...... length of i th word
diagonal[ch] ........ true if character ch is first and last letter of
same word; otherwise it is false.
freq[i]......num .... frequency of i th letter ( a = 1, b = 2,...)
size ................ number of words input
*)
procedure readnames;
var current : integer;
begin
current := 0;
k := ord('a')-1;
read(ch);
while (ch = ' ') or (ch = ',') do
read(ch);
while not eof do
begin
current := current + 1;
len := 0;
repeat
len := len + 1;
name[len] := ch;
read(ch);
until (ch = ' ') or (ch = ',');
(* blank fill rest of name and place name in pnames *)
for i := len + 1 to maxwordsize do
name[i] := ' ';
pack(name,1,pnames[current]);
ch1 := name[1];
ch2 := name[len];
if ch1 = ch2
then
diagonal [ch1] := true;
i := ord(ch1) - k;
freq[i].num := freq[i].num + 1;
i := ord(ch2) - k;
freq[i].num := freq[i].num + 1;
graph[current].row := ch1;
graph[current].col := ch2;
graph[current].length := len;
graph[current].index := current;
while ((ch = ' ') or (ch = ',')) and (not eof) do
read(ch);
end;
size := current
end; (* readnames *)
(****************** procedure findorder ************************
findorder computes an ordering of the letters based on following
algorithm:
1. if letter appears as first and last letter of same word add 100
to its frequency.
2. multiply frequency of each letter by 100.
3. for each letter and each word it appears in, add the original fre-
quency of the other letter in the word to the frequency of the
letter.
4. sort letters in order of decreasing frequency.
order[i] ............. letter in i th position in ordering
lorder[ch] ........... position of letter ch in ordering
*)
procedure findorder;
label
99;
var
max,j,j1,j2 : integer;
tarray : array [1..26] of integer;
begin (* order by frequency *)
k := ord('a') -1;
for ch := cha to chz do
begin
j := ord(ch) -k;
tarray[j] := freq[j].num;
if diagonal[ch]
then
freq[j].num := freq[j].num + 100;
freq[j].num := freq[j].num * 100
end;
for i := 1 to size do
begin
j1 := ord(graph[i].row) - k;
j2 := ord(graph[i].col) - k;
freq[j1].num := freq[j1].num + tarray[j2];
freq[j2].num := freq[j2].num + tarray[j1];
end;
for i := 1 to 25 do
begin
max := i;
for j := i + 1 to 26 do
if freq [j].num > freq [max].num
then
max := j;
if freq [max].num = 0
then goto 99;
if max <> i
then
begin
temprec := freq[ i ];
freq [ i ] := freq [max];
freq [max] := temprec
end;
end;
99: i := 1;
while i <= 26 do
begin
if freq[i].num > 0
then
begin
ch := freq [i].letter;
order [i] := ch;
lorder [ch] := i;
lastletter := i;
i := i + 1
end
else
i := 27
end
end; (* findorder *)
(*************************** procedure ordergraph **********************
each word is represented by a triple (first letter,last letter,length) or
(graph[i].row, graph[i].col, graph[i].length). for each triple, inter-
change first letter and last letter if last letter precedes first letter
in letter ordering. then sort triples according to the letter ordering
using graph[i].col as key.
*)
procedure ordergraph;
begin
(* order row col pairs *)
for i := 1 to size do
if lorder [ graph [i].col ] < lorder[graph[i].row]
then
begin
ch := graph[i].col;
graph[i].col := graph[i].row;
graph[i].row := ch
end;
(* order points of graph *)
pos := 0;
for j := 1 to lastletter do
begin
ch := order[j];
for i := pos + 1 to size do
if graph[i].col = ch
then
begin
pos := pos + 1;
twordrec := graph[i];
graph[i] := graph[pos];
graph[pos] := twordrec
end;
end;
i := 1;
while i <= size do
begin
ch := graph[i].col ;
if diagonal[ch]
then
begin
j := i + 1;
while graph[j].col = ch do
j := j + 1;
k := i;
i := j;
j := j -1;
while k < j do
begin
if graph[k].row = ch
then
begin
twordrec := graph[j];
graph[j] := graph[k];
graph[k] := twordrec;
j := j -1
end;
k := k + 1
end
end
else
i := size + 1
end
end; (* ordergraph *)
procedure backup (var ch:char; var point : integer);
forward;
(************************ function checkdiag ****************************
function checkdiag(ch) returns the value false if ch is the first and
last letter of the same word and the value assignment for ch hashes
one of these words to a non-empty slot. otherwise it returns true.
*)
function checkdiag(ch:char;val,point:integer): boolean;
var check : boolean;
begin
check := true;
if diagonal[ch]
then
begin
len := graph[point].length;
while graph[point].col = ch do
begin
check := check and (hashtable[2*val + len] = 0);
point := point + 1;
end
end;
checkdiag := check;
end; (* checkdiag *)
(************************* procedure undo ********************************
undo(ch) deletes the value assignment for letter ch and all associated
hash table entries for triples with middle component ch.
*)
procedure undo(ch:char);
label 99;
var j : integer;
begin (* undo *)
point := point -1;
val := values[ch];
while graph[point].col = ch do
begin
j := graph[point].length + values[graph[point].row] + val;
hashtable[j] := 0;
if j = lowest then
if lowest = highest then
begin lowest := 100; highest := 0; goto 99 end
else
while hashtable[lowest] = 0 do lowest := lowest + 1
else
if j = highest then
while hashtable[highest] = 0 do highest := highest - 1;
point := point - 1
end;
point := point + 1;
99:end; (* undo *)
(************************* procedure findchval ***************************
for all triples with middle component ch, findchval attempts to find
a value assignment for ch so that all the triples hash to empty hash
table slots. findchval begins the search with the starting value start.
result is returned via val. findchval is successful if the value re-
turned via is not -1000. if val = -1000 then no value assignment is
possible because two of the triples have the same other letter value
and length sum.
*)
procedure findchval (ch:char;start,point:integer;var val:integer);
label 99;
var
empty : boolean;
i,j,min,temp : integer;
begin
numslots := 0;
while (graph[point].col = ch) and (graph[point].row <> ch) and
(point <= size) do
begin
numslots := numslots + 1;
slot[numslots] := graph[point].length + values[graph[point].row];
point := point + 1
end;
if numslots > 0
then
begin
if numslots > 1
then
begin
for i := 1 to numslots do
begin
min := i;
for j := i + 1 to numslots do
if slot[j] < slot[min]
then
min := j;
if min <> i
then
begin
temp := slot[min];
slot[min] := slot[i];
slot[i] := temp
end
end;
for i := 1 to numslots - 1 do
if slot[i] = slot[i+1]
then
begin
point := point - 1;
while graph[point].length +
values[graph[point].row] <> slot[i] do
point := point - 1;
ch1 := graph[point].row;
point := point - 1;
while graph[point].length +
values[graph[point].row] <> slot[i] do
point := point - 1;
ch2 := graph[point].row;
val := -1000;
goto 99
end
end;
(* handle special case of letter with frequency one *)
j := lorder[ch];
if freq[j].num < 200
then
start := -slot[1] + 1 + lowest;
empty := false;
val := start -1;
while not empty do
begin
val := val + 1;
empty := true;
for k := 1 to numslots do
empty := empty and (hashtable[val+slot[k]] = 0);
empty := empty and checkdiag(ch,val,point);
end; (* while *)
end (* then *)
else
begin
val := start;
while not checkdiag(ch,val,point) do
val := val + 1
end;
99:end; (* findchval *)
(*********************** procedure createtable ****************************
createtable finds letter value assignments and makes hash table entries
for a minimal perfect hash. it does the letter value assignments by
considering all groups of triples with the same middle component begin-
ning with the first letter in the letter ordering.
*)
procedure createtable;
label 99 , 98 ;
begin
point := 1;
i := 1;
while i <= lastletter do
begin
ch := order[i];
findchval(ch,0,point,val);
98: if val = -1000
then
begin
repeat
i := i -1;
ch := order[i];
undo(ch);
until (ch = ch1) or (ch = ch2);
val := values[ch] + 1;
findchval(ch,val,point,val);
goto 98
end;
values[ch] := val;
while graph[point].col = ch do
begin
j := graph[point].length + values[graph[point].row] + val;
if ( j <= highest - size)
then
begin
undo(ch);
findchval(ch,val+1,point,val);
goto 98
end;
if (j >= lowest + size)
then
begin
backup(ch,point);
goto 98
end
else
if hashtable[j] <> 0
then
begin
ch := graph[point].col;
undo(ch);
val := values[ch];
findchval(ch,val+1,point,val);
goto 98
end
else
begin
hashtable[j] := graph[point].index;
if j < lowest
then
lowest := j;
if j > highest
then
highest := j;
if point >= size
then
goto 99
else
point := point + 1
end
end;
i := i + 1
end;
99:end; (* createtable *)
(************************* procedure backup ********************************
letter value assignment returned by findchval does not yield minimal
table. backup undoes value assignment and any associated hash table
entries for the group of triples with the current letter and the group of
triples for the previous letter.
*)
procedure backup;
begin
if graph[point-1].col = ch
then
undo(ch);
repeat
i := i-1;
ch := order[i];
until graph[point -1].col = ch;
undo(ch);
findchval(ch,val+1,point,val);
values[ch] := val
end; (* backup *)
begin (* main program *)
initialize;
readnames;
findorder;
ordergraph;
sttime := clock;
createtable;
entime := clock;
writeln(' execution time = ', (entime-sttime));
writeln(' letter value ');
for ch := cha to chz do
writeln(' ',ch,values[ch] :8);
for i := lowest to highest do
begin
j := hashtable[i];
writeln(' hash table[',i:3,'] ',pnames[j])
end
end. (* main program *)
More information about the Comp.sources.unix
mailing list