SORT for the IBM PC, with lots of options
Bernie Roehl
broehl at watale.UUCP
Sun Aug 24 09:39:41 AEST 1986
This is a really good sorter for IBM PCs. It is originally from the Dr. Dobbs
Magazine. Have Fun !!!!!
# This is a shell archive. Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by watale!broehl on Sat Aug 23 19:31:42 EDT 1986
# Contents: read.me sort.c ssort.c
echo x - read.me
sed 's/^@//' > "read.me" <<'@//E*O*F read.me//'
These files make up an expanded version of the DOS utility SORT.
This posting also needs a subroutine out of the LIBRARY posting.
@//E*O*F read.me//
chmod u=rw,g=r,o=r read.me
echo x - sort.c
sed 's/^@//' > "sort.c" <<'@//E*O*F sort.c//'
/*
* SORT.C
*
* Copyright (c) 1986 Allen I. Holub
* All rights reserved.
*/
#define LINT_ARGS
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <malloc.h>
#include "getargs.h"
/*----------------------------------------------------------------------
* General purpose #defines.
*/
#define MAXBUF (132 + 1) /* Maximum input line length +1 */
#define MAXLINEC 1024 /* Maximum number of lines in */
/* an input file before merge */
/* files start to be created */
#define MAXTMP 18 /* The maximum number of temp- */
/* orary files that will be */
/* created. Two FILE ptrs. are */
/* needed for stdout, and */
/* stderr during output */
#define isnum(c1) ( isdigit(c1) || (c1) == '-' )
/*----------------------------------------------------------------------
* Variables for getargs. The immediately following variables will
* be modified by getargs() according to what flags it finds on the
* command line.
*/
static int Debug = 0 ; /* Debug Mode */
static int Verbose = 0 ; /* Say what you're doing */
static int Noblanks = 0 ; /* Blanks don't count */
static int Numeric = 0 ; /* Sort numbers by val */
static int Primary = 0 ; /* Primary sort key */
static int Secondary = 0 ; /* Secondary sort key */
static int Dictorder = 0 ; /* Use dictionary order */
static int Foldupper = 0 ; /* Fold upper case */
static int Reverse = 0 ; /* Sort in reverse order */
static int Delim = 32 ; /* Field separator */
static char *Mdir = "" ; /* Put temp files here */
static int Nodups = 0 ; /* Don't print duplicate */
/* lines. */
ARG Argtab[] =
{
{ 'b' , ARG_FBOOLEAN, &Noblanks, "ignore leading whitespace (Blanks)"},
{ 'd' , ARG_FBOOLEAN, &Dictorder, "sort in Dictionary order" },
{ 'f' , ARG_FBOOLEAN, &Foldupper, "Fold upper into lower case" },
{ 'n' , ARG_FBOOLEAN, &Numeric, "sort Numbers by numeric value" },
{ 'p' , ARG_INTEGER, &Primary, "use field <num> as Primary key" },
{ 'r' , ARG_FBOOLEAN, &Reverse, "do a reverse sort" },
{ 's' , ARG_INTEGER, &Secondary, "use field <num> as Secondary key" },
{ 't' , ARG_CHARACTER, &Delim, "use <C> to separate fields" },
{ 'u' , ARG_FBOOLEAN, &Nodups, "delete duplicate lines in output" },
{ 'v' , ARG_FBOOLEAN, &Verbose, "(Verbose) diagnostics to stderr" },
{ 'w' , ARG_STRING, (int *)&Mdir, "prepend <str> to Temp file names" },
{ 'z' , ARG_FBOOLEAN, &Debug, "Debug Mode" }
};
#define NUMARGS (sizeof(Argtab) / sizeof(ARG))
/*----------------------------------------------------------------------
* Global variables. The Lines array is used for the initial
* sorting.
*/
static int Options; /* Set by main if any options set */
static char *Lines[MAXLINEC]; /* Holds arrays of input lines */
static int Linec; /* # of items in Lines */
static char **Argv; /* Copies of argv and argc */
static int Argc;
/*----------------------------------------------------------------------
* The heap used in the merge process:
*/
typedef struct
{
char string[MAXBUF]; /* One line from the merge file */
FILE *file; /* Pointer to input file */
}
HEAP;
HEAP *Heap[ MAXTMP ]; /* The heap itself */
/*----------------------------------------------------------------------*/
pheap( str, n )
char *str;
{
int i;
printf("+--------------------------\n");
printf("| %s, heap is:\n", str);
for(i = 0; i < n ; i++ )
{
printf("|%02d: %s", i, *(Heap[i]->string) ?
Heap[i]->string : "(null)\n" );
}
printf("+--------------------------\n");
}
/*----------------------------------------------------------------------*/
int stoi(instr)
register char **instr;
{
/* Convert string to integer updating *instr to point
* past the number. Return the integer value represented
* by the string.
*/
register int num = 0 ;
register char *str ;
int sign = 1 ;
str = *instr;
if( *str == '-' )
{
sign = -1 ;
str++;
}
while( '0' <= *str && *str <= '9' )
num = (num * 10) + (*str++ - '0') ;
*instr = str;
return( num * sign );
}
/*----------------------------------------------------------------------*/
int dedupe(argc, argv)
int argc;
char **argv;
{
/* Compress an argv-like array of pointers to strings so that
* adjacent duplicate lines are eliminated. Return the
* argument count after the compression.
*/
register int i ;
int nargc ;
char **dp ;
nargc = 1;
dp = &argv[1];
for ( i=1 ; i < argc ; i++ )
{
if( strcmp(argv[i-1], argv[i]) != 0 )
{
*dp++ = argv[i];
nargc++;
}
}
return( nargc );
}
/*----------------------------------------------------------------------*/
static char *skip_field(n, str)
int n;
char *str;
{
/* Skip over n fields. The delimiter is in the global
* variable Delim. Return a pointer to either the character
* to the right of the delimiter, or to the '\0'.
*/
while( n > 0 && *str )
{
if( *str++ == Delim)
--n;
}
return(str);
}
/*----------------------------------------------------------------------
* Comparison functions needed for sorting.
*
* ssort() will call either argvcmp or qcmp, passing them pointers
* to linev entries. qcmp() calls two workhorse functions, qcmp1()
* and qcmp2(). The workhorse functions will also be called by the
* reheap() subroutine used to manipulate merge files.
*/
static int argvcmp( s1p, s2p )
char **s1p, **s2p;
{
return( strcmp( *s1p, *s2p ) );
}
/*----------------------------------------------------------------------*/
static int qcmp( str1p, str2p )
char **str1p;
char **str2p;
{
/* Takes care of all the sorting of fields, calling qcmp1
* to do the actual comparisons. Assuming here that
* Secondary won't be set unless Primary is set too.
*/
return( qcmp1( *str1p, *str2p ) );
}
/*----------------------------------------------------------------------*/
static int qcmp1( str1, str2 )
char *str1, *str2;
{
/* Workhorse comparison function. Takes care of sorting
* fields. If a primary sort field is specified then
* qcmp1() skips to that field and calls qcmp2 to do the
* actual comparison. If the primary fields are equal, then
* the secondary fields are compared in the same way.
*/
int rval;
if( !Primary )
return qcmp2( str1, str2 );
else
{
rval = qcmp2( skip_field(Primary - 1, str1),
skip_field(Primary - 1, str2) );
if( !rval && Secondary )
{
/* The two primary keys are equal, search the */
/* secondary keys if one is specified */
rval = qcmp2( skip_field(Secondary - 1, str1),
skip_field(Secondary - 1, str2) );
}
return rval;
}
}
/*----------------------------------------------------------------------*/
static int qcmp2( str1, str2 )
char *str1;
char *str2;
{
/* Workhorse comparison function. Deals with all command line
* options except fields. Returns
*
* 0 if str1 == str2
* positive if str1 > str2
* negative if str1 < str2
*
* This is a general purpose (and therefore relatively slow)
* routine. Use strcmp() if you need a fast compare.
* Comparison stops on reaching end of string or on encountering
* a Delim character (if one exists). So make sure Delim is
* set to '\0' if you're not sorting by fields.
*/
register unsigned int c1, c2;
if( Noblanks )
{
/*
* Skip past leading whitespace in both strings
*/
while( isspace(*str1) )
str1++;
while( isspace(*str2) )
str2++;
}
do
{
if( Numeric && isnum(*str1) && isnum(*str2) )
{
/* Add 0xff to the two numeric values so that
* c1 and c2 can't be confused with a Delim
* character later on.
*/
c1 = stoi( &str1 ) + 0xff ;
c2 = stoi( &str2 ) + 0xff ;
if( c1 == c2 )
continue;
else
break;
}
c1 = *str1++;
c2 = *str2++;
if(Dictorder)
{
/* Skip past any non-alphanumeric or blank
* characters.
*/
while( c1 && !isalnum(c1) )
c1 = *str1++ ;
while( c2 && !isalnum(c2) )
c2 = *str2++ ;
}
if(Foldupper)
{
/* Map c1 and c2 to upper case */
c1 = toupper( c1 );
c2 = toupper( c2 );
}
/* Keep processing while the characters are the same
* unless we've reached end of string or a delimiter.
*/
}
while( (c1==c2) && c1 && c1 != Delim );
if( Delim ) /* If we're sorting on a field */
{ /* and we've found a delimiter */
if(c1 == Delim) /* then map the delimiter to a */
c1 = 0; /* zero for purposes of */
/* comparison. */
if(c2 == Delim)
c2 = 0;
}
return( Reverse ? (c2 - c1) : (c1 - c2) );
}
/*----------------------------------------------------------------------*/
FILE *nextfile()
{
/* Return a FILE pointer for the next input file or NULL
* if no more input files exist (ie. all of the files
* on the command line have been processed) or a file
* can't be opened. In this last case print an error message.
* If Argc == 1 the first time we're called, then standard
* input is returned (the first time only , NULL is returned
* on subsequent calls).
*/
FILE *fp;
static int first_time = 1 ;
if( first_time )
{
first_time = 0;
if( Argc == 1 )
return stdin;
}
if( Argc-- > 1 )
{
if( !(fp = fopen(*++Argv, "r")) )
fprintf(stderr, "sort: can't open %s for read\n",
*Argv );
else if( Verbose )
fprintf(stderr, "sort: reading: %s\n", *Argv );
return fp;
}
return NULL;
}
/*----------------------------------------------------------------------*/
gtext()
{
/* Get text from the appropriate input source and put
* the lines into Linev, updating Linec. Return non-zero
* if more input remains. Note that non-zero will
* be returned if there are exactly MAXLINEC lines in
* the input, even though there isn't any more actual
* input available. If malloc can't get space for the line,
* we'll remember the line in buf and return 1.
*/
register int c;
static FILE *fp = 0;
static char buf[ MAXBUF ] = {0}; /* Input buffer */
int maxcount;
char **lv;
if( !fp ) /* This is only true the first */
fp = nextfile(); /* time we're called. */
lv = Lines ;
Linec = 0 ;
for( maxcount = MAXLINEC; --maxcount >= 0 ;)
{
if( !*buf )
while( fgets(buf, MAXBUF, fp) == NULL )
if( !(fp = nextfile()) )
return( 0 ); /* No more input */
if( *lv = malloc(strlen(buf) + 1) )
{
strcpy( *lv++, buf );
*buf = '\0';
Linec++;
}
else
return 1;
}
return( maxcount < 0 ); /* Return 1 if there's more input to get */
}
/*----------------------------------------------------------------------*/
char *fname( num )
{
/* Return a merge file name for the indicated merge pass.
*/
static char name[ 16 ];
if( num > MAXTMP )
{
fprintf(stderr,"sort: input file too large\n" );
exit(1);
}
sprintf(name, "%smerge.%d", Mdir, num );
return( name );
}
/*----------------------------------------------------------------------*/
outtext( passnum, more_to_go )
{
/* Print out all the strings in the Lines array and free all
* the memory that they use. Output is sent to standard
* output if this is pass 1 and there's no more input
* to process, otherwise output is sent to a merge file.
*/
register char **lv;
register FILE *fp;
if( passnum == 1 && !more_to_go )
fp = stdout;
else if( !(fp = fopen( fname(passnum), "w" )) )
{
fprintf(stderr,"Can't open merge file %s for write\n",
fname(passnum));
exit(1);
}
else if( Verbose )
fprintf(stderr,"sort: creating: %s\n", fname(passnum));
for( lv = Lines ; --Linec >= 0; )
{
fputs( *lv, fp );
free ( *lv++ );
}
fclose( fp );
}
/*----------------------------------------------------------------------*/
open_mergefiles( nfiles )
{
/* Open all the merge files and create the heap. "nfiles"
* merge-files exist and the heap will have that many
* elements in it. The heap is unsorted on exit.
*/
HEAP **hp;
int i;
for( hp = Heap, i = nfiles; i > 0; hp++, --i )
{
if( ! (*hp = (HEAP *) malloc(sizeof(HEAP))) )
{
fprintf( stderr, "sort: out of memory!" );
exit( 1 );
}
if( !( (*hp)->file = fopen( fname(i), "r") ))
{
fprintf(stderr,"sort: can't open %s for read",
fname(i) );
exit( 1 );
}
if( !fgets( (*hp)->string, MAXBUF, (*hp)->file ) )
{
fprintf(stderr,"sort: merge file %s is empty",
fname(i) );
exit( 1 );
}
if( Verbose )
fprintf(stderr, "sort: merging: %s\n", fname(i) );
}
}
/*----------------------------------------------------------------------*/
mcmp( hpp1, hpp2 )
HEAP **hpp1, **hpp2;
{
/* Comparison routine for sorting the heap. Is passed
* two pointers to HEAP pointers and compares the
* string fields of these using the same workhorse
* functions used in the initial sorting phase.
*/
return Options ? qcmp1 ((*hpp1)->string, (*hpp2)->string)
: strcmp ((*hpp1)->string, (*hpp2)->string)
;
}
/*----------------------------------------------------------------------*/
reheap( nfiles )
{
/* Reheap the Heap, assume that the first element (**Heap)
* is the newly added one.
*/
register int parent, child;
HEAP *tmp;
for( parent = 0, child = 1; child < nfiles; )
{
/* Find the smaller child. Then if the parent is less
* than the smaller child, we're done. Otherwise
* swap the parent and child, and continue the
* reheaping process with a new parent.
*/
if( child+1 < nfiles ) /* if child+1 is in the heap */
if( mcmp(&Heap[child], &Heap[child+1]) > 0)
child++;
if( mcmp( &Heap[parent], &Heap[child]) <= 0)
break;
tmp = Heap[parent]; /* Exchange */
Heap[parent] = Heap[child ];
Heap[child ] = tmp;
parent = child;
child = parent << 1; /* child = parent * 2 */
}
}
/*----------------------------------------------------------------------*/
merge( nfiles )
int nfiles; /* Number of merge files */
{
open_mergefiles( nfiles );
ssort( Heap, nfiles, sizeof(Heap[0]), mcmp );
while( nfiles > 0 )
{
if( Debug )
pheap( "Merge: top of while loop", nfiles );
fputs( (*Heap)->string, stdout );
if( !fgets((*Heap)->string, MAXBUF, (*Heap)->file) )
{
/* This input stream is exhausted. Reduce the
* heap size to compensate. Note that Heap+1
* is the same as &Heap[1];
*/
fclose( (*Heap)->file );
if( --nfiles )
memcpy(Heap, Heap+1, nfiles * sizeof(HEAP));
}
reheap( nfiles );
}
}
/*----------------------------------------------------------------------*/
adjust_args()
{
/* Adjust various default arguments to fix mistakes made
* on the command line. In particular Delim is always 0
* unless either Primary or Secondary was set.
* If a secondary field is specified without a Primary, then
* 1 is assumed for the primary. If no Delim is specified
* then tab (\t) is assumed. "Options" is true if any of
* the options that affect the sort order were specified
* on the command line.
*/
if( !(Primary || Secondary) )
Delim = 0;
else
{
/* if( !Delim )
Delim = ' ';*/
if( !Primary )
Primary = 1;
}
Options = Noblanks || Numeric || Dictorder || Foldupper
|| Reverse || Delim;
}
/*----------------------------------------------------------------------*/
void usage()
{
fprintf( stderr, "Usage: sort [%c[bdfnruv][p<num>|s<num>][t<c>][w<str>]] [files]...\n", ARG_Switch );
fprintf( stderr, "\n");
fprintf( stderr, "Set the environment variable SWITCHAR to the character\n");
fprintf( stderr, "you wish to use for the Switch Character\n");
fprintf( stderr, "\n" );
fprintf( stderr, "Case of the command line switches %s important\n",
ARG_ICase ? "is not" : "is" );
fprintf( stderr, "\n" );
}
/*----------------------------------------------------------------------*/
main(argc, argv)
int argc;
char **argv;
{
int numpasses = 0; /* Number of merge files used */
int more_input; /* True if input isn't exhausted */
ctlc();
ARG_ICase = 1;
Argc = getargs( argc, argv, Argtab, NUMARGS );
Argv = argv;
adjust_args();
do{
more_input = gtext();
if( Linec )
{
ssort(Lines, Linec, sizeof(*Lines),
Options ? qcmp : argvcmp);
if( Nodups )
Linec = dedupe(Linec, Lines);
outtext( ++numpasses, more_input );
}
} while( more_input );
if( numpasses > 1 ) /* merge files were created */
{
fclose( stdin ); /* Free up default file des- */
fclose( stdaux ); /* criptors for unused streams */
fclose( stdprn ); /* so that they can be used for */
/* merge files. */
merge( numpasses );
for(; numpasses > 0 ; --numpasses )
{
unlink( fname(numpasses) );
if( Verbose )
fprintf(stderr, "sort: deleted: %s\n",
fname(numpasses));
}
}
exit(0);
}
@//E*O*F sort.c//
chmod u=rw,g=r,o=r sort.c
echo x - ssort.c
sed 's/^@//' > "ssort.c" <<'@//E*O*F ssort.c//'
/* SSORT.C Works just like qsort() except that a shell
* sort, rather than a quick sort, is used. This
* is more efficient than quicksort for small numbers of elements
* and it's not recursive so will use much less stack space.
* This routine started out as the one in K&R.
*
* 12/13/85: Modified to select a gap from the series
* 1, 4, 13, 40, 121 ... as per Knuth.
*
* Copyright (C) 1985, Allen I. Holub. All rights reserved
*/
void ssort( base, nel, width, cmp )
char *base;
int nel, width;
int (*cmp)();
{
register int i, j;
int gap, k, tmp ;
char *p1, *p2;
for( gap=1; gap <= nel; gap = 3*gap + 1 )
;
for( gap /= 3; gap > 0 ; gap /= 3 )
for( i = gap; i < nel; i++ )
for( j = i-gap; j >= 0 ; j -= gap )
{
p1 = base + ( j * width);
p2 = base + ((j+gap) * width);
if( (*cmp)( p1, p2 ) <= 0 )
break;
for( k = width; --k >= 0 ;)
{
tmp = *p1;
*p1++ = *p2;
*p2++ = tmp;
}
}
}
#ifdef DEBUG
cmp( cpp1, cpp2 )
char **cpp1, **cpp2;
{
register int rval;
printf("comparing %s to %s ", *cpp1, *cpp2 );
rval = strcmp( *cpp1, *cpp2 );
printf("returning %d\n", rval );
return rval;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
main( argc, argv )
int argc;
char **argv;
{
ssort( ++argv, --argc, sizeof(*argv), cmp );
while( --argc >= 0 )
printf("%s\n", *argv++ );
}
#endif
@//E*O*F ssort.c//
chmod u=rw,g=r,o=r ssort.c
echo Inspecting for damage in transit...
temp=/tmp/shar$$; dtemp=/tmp/.shar$$
trap "rm -f $temp $dtemp; exit" 0 1 2 3 15
cat > $temp <<\!!!
4 23 132 read.me
697 2600 16690 sort.c
71 300 1409 ssort.c
772 2923 18231 total
!!!
wc read.me sort.c ssort.c | sed 's=[^ ]*/==' | diff -b $temp - >$dtemp
if [ -s $dtemp ]
then echo "Ouch [diff of wc output]:" ; cat $dtemp
else echo "No problems found."
fi
exit 0
--
Michael A. Shiels
clyde-\
decvax-\\
ihnp4-\\\
+++-----> watmath!watale!broehl
tektronix-///
ubc-vision-//
utzoo-/
More information about the Comp.sources.unix
mailing list