Bit counting
Bennett Todd
bet at dukeac.UUCP
Wed Jan 18 17:59:51 AEST 1989
: This is a sharchive -- extract the files by running through sh.
echo x - README
sed 's/^X//' <<\Shar_Eof >README
XI wanted to see for myself whether the byte-at-a-time table could perform
Xcomparably to the "parallel addition" algorithm presented; so I wrote a
Xlittle benchmark code to compare. I feel (and folks are welcome to disagree
Xwith me here, obviously) that taking advantage of pointer type punning to
Xconvert from shift-and-mask extractions to bump-a-pointer-and-dereference is
Xlegitimate and (as far as I know!) portable, and not unreasonably ugly when
Xyou are already wading down in the bit-bashing.
X
XI did this using GCC 1.31 on a Sun 3/50 with 4M under SunOS 3.5 (no suntools
Xrunning). The machine wasn't quite idle; I had an rlogin to another machine
Xwhere I was reading netnews. However, it was close enough to idle that I
Xthink the user times are reasonable for comparison. I ran it several times
Xand the runs didn't vary by as much as a second in the user time. 1000000
Xiterations didn't have quite enough separation to convince me for sure this
Xwas a solid result, so I bumped it to 5000000.
X
XThe timing results came out:
X
XByte at a time table lookup: 75.0 user
X Parallel addition: 82.7 user
X
XSo, given this coding style, and this compiler technology, on this sort of
Xarchitecture, it appears that the byte-wise table lookup isn't particularly
Xslower. It also isn't importantly faster. I personally find it more
Xunderstandable.
X
XI compared the output for the first 1000 results (on the output-generating
Xversion) and they were identical, so I hope this code is correctly computing
Xthe desired function. It compiled with the options given in the Makefile
X(optization cranked way up, fairly strict diagnostics) without error or
Xwarning messages.
X
XNote that the generator program in the comment is really a one-shot;
XI built it with a library I run against that performs system and library
Xcall error checking for me, because I am extremely lazy. It should be clear
Xhow to extract that into a stand-alone.
X
XFinally, this is my first try ever at coding a timing benchmark, so it is
Xquite possible I have committed some classic benchmark blunder. I hope not.
XThe separately-compiled "consume" procedure might not have been absolutely
Xnecessary, but it kept me from having to examine assembler or figure out
Xwhat a heavy-duty optimizing compiler might be capable of omitting.
Shar_Eof
echo x - Makefile
sed 's/^X//' <<\Shar_Eof >Makefile
X#
X# Choose one of the following four OPTIONS definitions, which
X# cover generating 1000-point validation versions on each algorithm,
X# and generating 5000000-point silent versions for timing.
X#
X
X# OPTIONS=-DLIM=5000000 -DBENCHMARK -DBYTE_ORIENTED_TABLE
XOPTIONS=-DLIM=5000000 -DBENCHMARK -DPARALLEL_ADDITION
X# OPTIONS=-DLIM=1000 -DVERIFY -DBYTE_ORIENTED_TABLE
X# OPTIONS=-DLIM=1000 -DVERIFY -DPARALLEL_ADDITION
X
XOBJ=bench.o consume.o
X
XCC=gcc
XCFLAGS=-O -g -Wall -Wwrite-strings -msoft-float -fstrength-reduce \
X -finline-functions -I/usr/local/include $(OPTIONS)
X
Xbench : $(OBJ)
X $(CC) $(CFLAGS) -o $@ $(OBJ)
X
X$(OBJ) : Makefile
Shar_Eof
echo x - bench.c
sed 's/^X//' <<\Shar_Eof >bench.c
X#include <stdio.h>
X
X#ifndef LIM
X#define LIM 1000000
X#endif
X
Xvoid consume(int);
Xint bitcount(int);
X
Xint main() {
X int i;
X
X for (i=0; i<LIM; i++)
X consume(bitcount(i));
X
X return(0);
X}
X
X#ifdef BYTE_ORIENTED_TABLE
Xint bitcount(int num) {
X register unsigned char *ptr = (unsigned char *) #
X register int count = 0;
X /*
X * The following table was generated, of course. Here's the
X * (one-shot, quick-and-dirty) generating program (derived
X * from a program in
X * Harbison and Steele, "C: A Reference Manual", Second Edition
X * on page 175):
X *
X * #include <bent.h>
X *
X * char syntax_args[] = "";
X *
X * int main(int argc, char **argv) {
X * register unsigned i;
X *
X * progname = argv[0];
X * if (argc != 1)
X * syntax();
X *
X * for (i = 0; i<256; i++) {
X * register unsigned tmp = i;
X * register int count = 0;
X * while (tmp != 0) {
X * tmp ^= (tmp & -tmp);
X * count++;
X * }
X * eprintf("%d%s", count, (i%32 == 31) ? ",\n" : ",");
X * }
X * return(0);
X * }
X */
X static char bits_per_byte[] = {
X 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
X };
X
X /*
X * The following lines are the loop:
X *
X * for (counter = 0; counter < sizeof(int); counter++)
X * count += bits_per_byte[*ptr++];
X *
X * unrolled.
X */
X count += bits_per_byte[*ptr++];
X count += bits_per_byte[*ptr++];
X count += bits_per_byte[*ptr++];
X count += bits_per_byte[*ptr++];
X return(count);
X}
X#endif /* BYTE_ORIENTED_TABLE */
X
X#ifdef PARALLEL_ADDITION
X/*
X Module: l_bitcnt.c
X Author: Richard E. K. Neitzel
X
Xrevision history
X----------------
X1.0,9jan89,rekn written
X
X
X Explanation:
X This routine takes an integer and returns the number of bits it contains
X which are set. It does this via 'parallel' addition, turning the integer
X into sets of bit pairs of increasing size. WARNING! This assumes that
X integers are 32 bits!
X*/
X
X/*
X * Excised from netnews posting, and entry point renamed "bitcount";
X * changes made for ANSI C.
X *
X * In other words, this is Richard E. K. Neitzel's work, not mine, but
X * I have monkeyed with it, so don't blame him.
X * Bennett Todd
X */
X
Xint bitcount(int word) {
X register int j, k; /* Our work area */
X
X if (!word) /* Why bother? */
X return(0);
X
X j = word;
X
X k = j & 0x55555555; /* Clear every other bit */
X j >>= 1; /* Do again for cleared bits */
X j &= 0x55555555;
X j += k; /* 1st pairs done */
X
X k = j & 0x33333333; /* Clear every two bits */
X j >>= 2;
X j &= 0x33333333;
X j += k; /* 2nd pairs done */
X
X k = j & 0xffff; /* Only need last 4 sets */
X j &= 0xffff0000; /* and top 4 here */
X j = j >> 16; /* ready - */
X j += k; /* add 3rd pairs */
X
X k = j & 0xf0f; /* Clear bits */
X j >>= 4; /* Repeat */
X j &= 0xf0f;
X j += k; /* add 4th pairs */
X
X k = j & 0xff; /* Now add the 8 bit pairs */
X j >>= 8;
X j += k;
X
X return(j); /* bye */
X}
X#endif /* PARALLEL_ADDITION */
Shar_Eof
echo x - consume.c
sed 's/^X//' <<\Shar_Eof >consume.c
X#include <stdio.h>
X
Xint printf(const char *, ...);
X
Xvoid consume(int val) {
X
X#ifdef VERIFY
X (void) printf("%d\n", val);
X#endif /* VERIFY */
X
X#ifdef BENCHMARK
X /* Then we simply return.... */
X#endif /* BENCHMARK */
X
X}
Shar_Eof
exit 0
-Bennett
bet at orion.mc.duke.edu
More information about the Comp.lang.c
mailing list