new survey to supplement arbitron. Please run this program.
Brian Reid
reid@decwrl.dec.com
Fri May 19 16:47:24 AEST 1989
I'm escalating my network measurement surveys. Below is a short
C program that analyzes news flow paths and reports them to a central location.
It has been alpha-tested at 25 sites, including VAX, Sun 3, Sun 4, Pyramid,
and some 3B machines. There may well be bugs, but I guarantee that if there
are, they will be obscure and difficult bugs.
This program takes about as long as "expire -r" to run, so it's not something
you want to do very often. Here at decwrl, where we keep 45 or 50 days of news
online, it can take several hours to run. On smaller machines that keep only
a few days of news online, it finishes in several minutes.
Ideally I'd like everybody to run this program once a month and mail the
results to pathsurvey@decwrl.dec.com. It's important that small sites as well
as big ones participate.
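Concretely, a monthly run is just the pipeline given in the comments at the
top of the program (assuming you have compiled it as "inpaths"):
    cd /usr/spool/news
    find . -type f -print | inpaths `hostname` | mail pathsurvey@decwrl.dec.com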
Thank you. Once the data starts rolling in, I'll be posting the results once
a month, just like arbitron.
Brian Reid
DEC Western Research
------------------------------------
/* inpaths.c -- track the paths of incoming news articles and prepare
* a report in a format suitable for decwrl pathsurveys
*
*
* This program reads a list of news article filenames on standard input and
* writes a data report which should be mailed to the decwrl Network
* Monitoring Project at "pathsurvey@decwrl.dec.com".
*
*
* Run it like this:
*
* cd /usr/spool/news
* find . -type f -print | inpaths "yourhost" | mail pathsurvey@decwrl.dec.com
*
* where "yourhost" is the host name of your computer.
*
* If you have a huge amount of news spooled and don't want to run
* all of it through inpaths, you can do something like
*
* find . -type f -mtime -10 -print | ...
*
* There are three options: -s, -m, and -l, for a short, medium, or long report.
* The default is to produce a long report. If you are worried about mail
* expenses you can send a shorter report. The long report is typically
* about 50K bytes for a major site, and perhaps 25K bytes for a smaller
* site.
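*
* To build it, something like "cc -o inpaths inpaths.c" should suffice.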
*
* Brian Reid
* V1.0 Sep 1986
* V2.0 May 1989
*
*/
#define VERSION "2.2"
#include <stdio.h>
#include <fcntl.h>
#include <ctype.h>
#include <sys/types.h>
#define SURVEYPERIOD 21 /* Maximum number of days in survey period */
#define INTERVAL SURVEYPERIOD*60*60*24
#define HEADBYTES 1024
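/* Only the first HEADBYTES bytes of each article are read; if the Path:
header does not fall entirely within them the article is skipped. */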
main (argc,argv)
int argc;
char **argv;
{
char linebuf[1024], jc, *lptr, *cp, *cp1, *cp2;
char rightdelim;
char *pathfield;
char artbuf[HEADBYTES];
char * scanlimit;
char *hostname;
char hostString[128];
int needHost;
static int passChar[256];
int isopen,columns,verbose,totalTraffic;
/* definitions for getopt */
extern int optind;
extern char *optarg;
/* structure used to tally the traffic between two hosts */
struct trec {
struct trec *rlink;
struct nrec *linkid;
int tally;
} ;
/* structure to hold the information about a host */
struct nrec {
struct nrec *link;
struct trec *rlink;
char *id;
long sentto; /* tally of articles sent to somebody from here */
} ;
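/* Host records live in 128 hash chains indexed by the first character of
the host name; each host record in turn carries a chain of tally records,
one per host that fed it news. */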
struct nrec *hosthash[128], *hnptr, *list, *relay;
struct trec *rlist;
int i, article, gotbytes, c;
extern int errno;
extern char *malloc(), *strcpy();
hostname = "unknown";
verbose = 2;
while (( c=getopt(argc, argv, "sml" )) != EOF)
switch (c) {
case 's': verbose=0; break;
case 'm': verbose=1; break;
case 'l': verbose=2; break;
case '?': fprintf(stderr,
"usage: %s [-s] [-m] [-l] hostname\n",argv[0]);
exit(1);
}
if (optind < argc) {
hostname = argv[optind];
} else {
fprintf(stderr,"usage: %s [-s] [-m] [-l] `hostname`\n",argv[0]);
exit(1);
}
fprintf(stderr,"computing %s inpaths for host %s\n",
verbose==0 ? "short" : (verbose==1 ? "medium" : "long"),hostname);
for (i = 0; i<128; i++) hosthash[i] = (struct nrec *) NULL;
/* precompute character types to speed up scan */
for (i = 0; i<=255; i++) {
passChar[i] = 0;
if (isalpha(i) || isdigit(i)) passChar[i] = 1;
if (i == '-' || i == '.' || i == '_') passChar[i] = 1;
}
totalTraffic = 0;
while (gets(linebuf) != NULL) {
lptr = linebuf;
isopen = 0;
/* Skip files that do not have pure numeric names */
i = strlen(lptr)-1;
if (i < 0) goto bypass; /* ignore empty input lines */
do {
if (!isdigit(linebuf[i])) {
if (linebuf[i]=='/') break;
goto bypass;
}
i--;
} while (i>=0);
/* Open the file for reading */
article = open(lptr, O_RDONLY);
isopen = (article >= 0);
if (!isopen) goto bypass;
/* Read in the first few bytes of the article; find the end of the header */
gotbytes = read(article, artbuf, HEADBYTES);
if (gotbytes < 10) goto bypass;
/* Find "Path:" header field */
pathfield = (char *) 0;
scanlimit = &artbuf[gotbytes];
for (cp=artbuf; cp < scanlimit; cp++) {
if (*cp == '\n') break; /* blank line ends the header */
if (cp+6 <= scanlimit && strncmp(cp, "Path: ", 6) == 0) {
pathfield = cp; goto gotpath;
}
while (cp < scanlimit && *cp != '\n') cp++; /* skip rest of this header line */
}
fprintf(stderr,"%s: didn't find 'Path:' in 1st %d bytes.\n",
lptr,HEADBYTES);
goto bypass;
gotpath: ;
/* Extract all of the host names from the "Path:" field and put them in our
host table. */
cp = pathfield;
while (cp < scanlimit && *cp != '\n') cp++;
if (cp >= scanlimit) {
fprintf(stderr,"%s: end of Path line not in buffer.\n",lptr);
goto bypass;
}
totalTraffic++;
*cp = 0;
pathfield += 5; /* skip 'Path:' */
cp1 = pathfield;
relay = (struct nrec *) NULL;
rightdelim = '!';
while (cp1 < cp) {
/* get next field */
while (*cp1=='!') cp1++;
cp2 = ++cp1;
while (passChar[(int) (0xFF & *cp2)]) cp2++;
rightdelim = *cp2; *cp2 = 0;
if (rightdelim=='!' && *cp1 != '\0') {
/* see if already in the table */
list = hosthash[*cp1];
while (list != NULL) {
/*
* Attempt to speed things up here a bit. Since we hash
* on the first char, we see if the second char is a match
* before calling strcmp()
*/
if (list->id[1] == cp1[1] && !strcmp(list->id, cp1)) {
hnptr = list;
break; /* I hate unnecessary goto's */
}
list = list->link;
}
if(list == NULL) {
/* get storage and splice in a new one */
hnptr = (struct nrec *) malloc(sizeof (struct nrec));
hnptr->id = (char *) strcpy(malloc(1+strlen(cp1)),cp1);
hnptr->link = hosthash[*cp1];
hnptr->rlink = (struct trec *) NULL;
hnptr->sentto = (long) 0;
hosthash[*cp1] = hnptr;
}
}
/*
At this point "hnptr" points to the host record of the current host. If
there was a relay host, then "relay" points to its host record (the relay
host is just the previous host on the Path: line). Since this Path means
that news has flowed from host "hnptr" to host "relay", we want to tally
one message for that link. The tally record hangs off the receiving host
"relay", and its "linkid" field points back at the sending host "hnptr".
*/
if (relay != NULL && relay != hnptr) {
rlist = relay->rlink;
while (rlist != NULL) {
if (rlist->linkid == hnptr) goto have2;
rlist = rlist->rlink;
}
rlist = (struct trec *) malloc(sizeof (struct trec));
rlist->rlink = relay->rlink;
relay->rlink = rlist;
rlist->linkid = hnptr;
rlist->tally = 0;
have2: rlist->tally++;
hnptr->sentto++;
}
cp1 = cp2;
relay = hnptr;
if (rightdelim == ' ' || rightdelim == '(') break;
}
bypass: if (isopen) close(article) ;
}
/* Now dump the host table */
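/* Report framing: each section is bracketed by "ZCZC begin ..." and
"ZCZC end ..." marker lines; the begin line carries the program version,
the reporting host name, the verbosity level, the total number of
articles tallied, and the survey period in days. */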
printf("ZCZC begin inhosts %s %s %d %d %d\n",
VERSION,hostname,verbose,totalTraffic,SURVEYPERIOD);
for (jc=0; jc<127; jc++) {
list = hosthash[jc];
while (list != NULL) {
if (list->rlink != NULL) {
if (verbose > 0 || (100*list->sentto > totalTraffic))
printf("%d\t%s\n",list->sentto, list->id);
}
list = list->link;
}
}
printf("ZCZC end inhosts %s\n",hostname);
printf("ZCZC begin inpaths %s %s %d %d %d\n",
VERSION,hostname,verbose,totalTraffic,SURVEYPERIOD);
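/* One line per host that appears as a receiver on some Path: line. Each
line starts with "<host> H" followed by one or more "<count> Z <feeder> U"
groups, recording that <feeder> passed <count> of the surveyed articles
directly to <host>; lines longer than about 70 columns are continued by
repeating the "<host> H" prefix. */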
for (jc=0; jc<127; jc++) {
list = hosthash[jc];
while (list != NULL) {
if (verbose > 1 || (100*list->sentto > totalTraffic)) {
if (list->rlink != NULL) {
columns = 3+strlen(list->id);
sprintf(hostString,"%s H ",list->id);
needHost = 1;
rlist = list->rlink;
while (rlist != NULL) {
if (
(100*rlist->tally > totalTraffic)
|| ((verbose > 1)&&(5000*rlist->tally>totalTraffic))
) {
if (needHost) printf("%s",hostString);
needHost = 0;
relay = rlist->linkid;
if (columns > 70) {
printf("\n%s",hostString);
columns = 3+strlen(list->id);
}
printf("%d Z %s U ", rlist->tally, relay->id);
columns += 9+strlen(relay->id);
}
rlist = rlist->rlink;
}
if (!needHost) printf("\n");
}
}
list = list->link;
}
}
printf("ZCZC end inpaths %s\n",hostname);
fclose(stdout);
exit(0);
}