v08i092: fast & simple sorting program
Brandon S. Allbery - comp.sources.misc
allbery at uunet.UU.NET
Sun Oct 8 11:19:12 AEST 1989
Posting-number: Volume 8, Issue 92
Submitted-by: ok at cs.mu.oz.au.UUCP (Richard O'Keefe)
Archive-name: nsort
#!/bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #!/bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
# README
# makefile
# nsort.1
# nsort.c
cat >README <<'------ EOF ------'
nsort is a totally stripped down sort(1); it reads a file into memory,
sorts it using merge sort, and writes the result out. I wrote it to
show that merge sort is fast -- sort(1) is documented as using a
version of quicksort internally, and nsort beats it easily even when
sort(1) is given enough memory to sort in memory.
To compile it, just run make(1).
------ EOF ------
ls -l README
cat >makefile <<'------ EOF ------'
nsort:
cc -o nsort -O nsort.c
------ EOF ------
ls -l makefile
cat >nsort.1 <<'------ EOF ------'
.TH NSORT 1
.\"/*%nroff -man %
.SH NAME
nsort \*- fast and simple sorting program
.SH SYNOPSIS
.B nsort
<in >out
.SH DESCRIPTION
The
.B nsort
command
reads lines from its standard input, sorts them, and writes the
sorted lines to its standard output. It behaves just like
sort(1) when the latter is given no arguments. The only reason
for using
.B nsort
is speed: when it can be used it is up to twice as fast as sort(1).
.PP
This program does
.I not
accept file names as arguments. To process more than
one input file, pipe the output of cat(1) into
.B nsort.
.PP
The program does not need to know how many lines there
are in the standard input, nor does it need to read the
standard input more than once. However, the data must
fit into memory.
.SH OPTIONS
The
.B nsort
command has no options.
.SH EXAMPLES
.IP % 3
ls -f . | nsort
.PP
has the same effect as a plain `ls'.
.SH DIAGNOSTICS
An error message is printed on stderr and the program halts without
producing any output if
.PP
there are any arguments on the command line (status EX_USAGE is
returned in this case), or
.PP
the program is unable to obtain enough memory using malloc (if the
file has N lines and C characters, roughly 8N+C bytes are needed)
(status EX_SOFTWARE is returned in this case), or
.PP
something goes wrong while reading standard input
(status EX_IOERR is returned in this case).
.PP
All error messages indicate the actual name the program was invoked by.
If an error message is produced, the rest of the standard input is not
read, so you are likely to get "Broken pipe" messages from upstream
processes.
.SH BUGS
Input lines which contain NUL characters are quietly truncated.
sort(1) cannot handle such lines either, but complains.
.SH "SEE ALSO"
sort(1)
.SH AUTHOR
Richard A. O'Keefe
------ EOF ------
ls -l nsort.1
cat >nsort.c <<'------ EOF ------'
/* File : nsort.c
Author : Richard A. O'Keefe
Updated: 1988
Purpose: Fast sorting command for smallish files.
This program has no copyright notice. You can do anything you please
with it, except blame me if it doesn't work.
*/
#ifndef lint
static char SCCSid[] = "%Z%%E% %M% %I%";
#endif/*lint*/
/* nsort <input >output
is a very simple sorting program which has been written to be as fast
as possible. It is equivalent to sort with no arguments. That is,
it sorts its standard input to its standard output, and compares
entire lines.
It uses a natural merge, so if the input is already in order it takes
linear time, and it reads the entire file into memory, using a single
read() if the standard input is a regular file.
Sorting random permutations of /usr/dict/words on a Sun-3/50:
sort(1) : 15 seconds nsort : 8 seconds (with SCSI disc)
17 seconds 11 seconds (NFS over Ethernet)
*/
#include <stdio.h>
/* The following values are taken from <sysexits.h>, but are copied here
in case you lack that BSD header file.
*/
#define EX_OK 0 /* nothing went wrong */
#define EX_USAGE 64 /* something wrong with the command line */
#define EX_SOFTWARE 70 /* internal error (here, memory ran out) */
#define EX_IOERR 71 /* transput error (here, read() trouble) */
#define STDIN 0
extern char * malloc();
extern char * strcpy();
extern int strcmp();
extern int strlen();
extern void perror();
extern int lseek();
extern int read();
typedef struct item_rec *item_ptr;
struct item_rec
{
item_ptr next;
char* data;
};
#define IN_ORDER(x, y) (strcmp((x)->data, (y)->data) <= 0)
item_ptr nat_sort(list)
item_ptr list;
{
item_ptr stack[30];
item_ptr *sp = stack;
int runs = 0; /* total number of runs processed */
register item_ptr p, q, r;
struct item_rec header;
int k;
while (p = list) {
/* pick up a run from the front of list, setting */
/* p = (pointer to beginning of run), list = (rest of list) */
if (q = p->next) {
list = q->next;
if (!IN_ORDER(p, q)) r = q, q = p, p = r;
p->next = q;
for (r = list; r && IN_ORDER(q, r); r = r->next)
q->next = r, q = r;
q->next = (item_ptr)0;
list = r;
} else {
list = (item_ptr)0;
}
runs++;
/* merge this run with 0 or more runs off the top of the stack */
for (k = runs; 1 &~ k; k >>= 1) {
q = *--sp;
r = &header;
while (q && p)
if (IN_ORDER(q, p)) {
r->next = q;
r = q, q = q->next;
} else {
r->next = p;
r = p, p = p->next;
}
r->next = q ? q : p;
p = header.next;
}
/* push the merged run onto the stack */
*sp++ = p;
}
if (sp == stack) return (item_ptr)0;
/* merge all the runs on the stack */
p = *--sp;
while (sp != stack) {
q = *--sp;
r = &header;
while (q && p)
if (IN_ORDER(q, p)) {
r->next = q;
r = q, q = q->next;
} else {
r->next = p;
r = p, p = p->next;
}
r->next = q ? q : p;
p = header.next;
}
return p;
}
item_ptr alloc_item(data)
char *data;
{
register item_ptr result;
result = (item_ptr)malloc(sizeof *result + strlen(data) + 1);
if (result) {
result->next = (item_ptr)0;
result->data = strcpy((char*)result + sizeof *result, data);
}
return result;
}
/* What we really want to do is to slurp the entire file in with one call
to read(). In order to do that, we need to know how much there is. A
file's size can be determined either by calling fstat() or by using an
lseek() to the end. The number you get from fstat() doesn't mean much
for pipes and terminals, so lseek() appears to be the better approach.
Note that even when stdin is connected to a file, part of it may have
been read already. In that case, we should not include the part which
has been read. So we do
i = lseek(STDIN, 0, 1)
to discover the current position (and simultaneously to find out if we
can use lseek() at all on this file descriptor). One problem remains;
the size of the file may change while we are reading it. In that case
we'll stop early if we have to, but if the file grows we'll miss stuff
added at the end.
*/
main(argc, argv)
int argc;
char **argv;
{
struct item_rec header;
item_ptr p, q;
char *progname = argc >= 1 ? argv[0] : "nsort";
int i;
if (argc != 1) {
fprintf(stderr, "Usage: %s <unordered >sorted\n", progname);
exit(EX_USAGE);
}
i = lseek(STDIN, 0, 1); /* current position in stream */
if (i < 0) { /* can't find out the size by seeking */
char buffer[1024];
for (p = &header
; fgets(buffer, sizeof buffer, stdin)
; p->next = q, p = q) {
i = strlen(buffer);
if (i > 0 && buffer[i-1] == '\n') buffer[i-1] = '\0';
if (!(q = alloc_item(buffer))) {
fprintf(stderr, "%s: ran out of memory\n", progname);
exit(EX_SOFTWARE);
}
} else {
register char *b, *c;
register int n;
n = lseek(STDIN, 0, 2) - i; /* part of stdin may have been read */
(void) lseek(STDIN, i, 0); /* go back to original position */
if (!(b = malloc(n+1))) {
fprintf(stderr, "%s: ran out of memory\n", progname);
exit(EX_SOFTWARE);
}
for (c = b; (i = n-(c-b)) > 0; c += i) {
i = read(STDIN, c, i);
if (i < 0) {
perror(progname);
exit(EX_IOERR);
} else
if (i == 0) {
break;
}
}
/* it is possible that the file may have been truncated */
/* while we were reading it. */
n = c-b;
for (p = &header; n > 0; b = c, p->next = q, p = q) {
for (c = b; --n >= 0; c++)
if (*c == '\n') {
*c++ = '\0';
break;
}
if (n < 0) *c = '\0';
if (!(q = (item_ptr) malloc(sizeof *q))) {
fprintf(stderr, "nsort: ran out of memory\n");
exit(EX_SOFTWARE);
}
q->data = b;
}
}
p->next = (item_ptr)0;
p = header.next;
for (p = nat_sort(p); p; p = p->next)
puts(p->data);
exit(EX_OK);
}
------ EOF ------
ls -l nsort.c
exit 0
More information about the Comp.sources.misc
mailing list