Tool to find duplicate articles
Jerry Aguirre
jerry at olivey.olivetti.com
Fri Aug 17 05:06:16 AEST 1990
Here is a tool I threw together when my news history got corrupted and
users started complaining about seeing duplicates of articles.
===BEGIN histdups.c===
#include <stdio.h>
#include <string.h>
/* Size in bytes of every line buffer in main() and of each slot in files[]. */
#define LINESIZ 1024
/* Capacity of files[]: the most file names remembered for one article id. */
#define MAXF 32
/* Expects the stdin to be the history file, sorted. Stdout is a list
* of file names which are duplicates of earlier articles. Run after
* expire -r and then "rm" the files listed in the output.
*
* sort <history | histdups >dupfiles; xargs <dupfiles rm
*
* If the news history becomes corrupted then you can wind up with
* duplicates. These are both a waste of space and a pain for people
* reading news.
*
* B news expire -r will find the dups and then enter all of them into
* the history file. (It doesn't even match up the cross postings
* to each other correctly.) This program will output the names of all
* but the first duplicate in each news group. (Where "first" is based
* on article numbering which presumably represents arrival order.)
*
* 16Aug90 Jerry Aguirre <jerry at atc.olivetti.com>
*/
/*
 * File names gathered by parsefiles() from the history lines that share the
 * article id currently being examined; main() resets the list (nf = 0) each
 * time a new id starts.
 */
char files[MAXF][LINESIZ];
/* Number of slots of files[] currently in use. */
int nf;
/*
 * Hand-written declarations of the library routines atol() and index().
 * Without them the calls would be implicitly declared as returning int,
 * which is wrong for a long result and for a pointer result.
 */
long atol();
char *index();
/* Defined later in this file. */
int parsefiles(char *line);
int printdups(void);

/*
 * Read a sorted history file from stdin and group consecutive lines whose
 * first tab-separated field (the article id) is identical.  For every group
 * of two or more lines the file names of all lines are collected with
 * parsefiles(), and printdups() is called when the group ends to write the
 * redundant file names to stdout.
 *
 * Lines without a tab are ignored.  Lines that do not fit in LINESIZ bytes
 * are skipped with a warning on stderr rather than processed in truncated
 * form, because a truncated file name in the output could name the wrong
 * file.
 *
 * Always returns 0.
 */
int main(void)
{
    char line[LINESIZ];
    char id[LINESIZ];       /* id of the group being collected */
    char lastline[LINESIZ]; /* first line of the group, not yet parsed */
    char *tab;
    size_t len;
    int ch, overlong;

    nf = 0;
    id[0] = '\0';
    lastline[0] = '\0';

    while (fgets(line, sizeof line, stdin) != NULL) {
        len = strcspn(line, "\n");
        if (line[len] == '\n') {
            line[len] = '\0';
        } else {
            /*
             * No newline in the buffer: either the input ended without
             * one, or the line is longer than the buffer.  Consume the
             * rest of the line; if anything had to be thrown away the
             * line really was too long.
             */
            overlong = 0;
            while ((ch = getchar()) != EOF && ch != '\n')
                overlong = 1;
            if (overlong) {
                fprintf(stderr, "histdups: skipping overlong history line\n");
                continue;
            }
        }

        tab = strchr(line, '\t');
        if (tab == NULL)
            continue;

        *tab = '\0';    /* isolate the id for comparison */
        if (strcmp(line, id) == 0) {    /* we have a dup */
            *tab = '\t';
            if (lastline[0] != '\0') {
                /* First repeat of this id: also take the group's first line. */
                parsefiles(lastline);
                lastline[0] = '\0';
            }
            parsefiles(line);
        } else {
            /* A new id starts: report the finished group, then reset. */
            printdups();
            strcpy(id, line);
            *tab = '\t';
            strcpy(lastline, line);
            nf = 0;
        }
    }

    /* The final group has no following line to trigger its report. */
    printdups();
    return 0;
}
parsefiles(line) char *line;
{
char *pd, *pf, *p;
pd = index(line, '\t');
if (pd) pd++;
else return;
pf = index(pd, '\t');
if (pf) pf++;
else return;
while (*pf) {
while (*pf == ' ') pf++;
if (*pf == '\0') return;
if (nf >= MAXF) return;
p = index(pf, ' ');
if (p) *p = '\0';
strcpy(files[nf], pf);
nf++;
if (p) {
pf = p + 1;
*p = ' ';
}
else return;
}
}
printdups()
{
int i1, i2, flags[MAXF];
long n1, n2;
char *p1, *p2;
for (i1 = 0; i1 < nf; i1++) flags[i1] = 0;
for (i1 = 0; i1 < nf; i1++) {
p1 = index(files[i1], '/');
if (!p1) continue;
*p1 = '\0';
n1 = atol(p1+1);
for (i2 = i1 + 1; i2 < nf; i2++) {
p2 = index(files[i2], '/');
if (!p2) continue;
*p2 = '\0';
if (strcmp(files[i1], files[i2]) == 0) { /* same group */
n2 = atol(p2+1);
if (n2 > n1) flags[i2] = 1; /* lowest number stays */
else if (n2 < n1) flags[i1] = 1;
}
*p2 = '/';
n2 = atol(p2+1);
}
*p1 = '/';
}
for (i1 = 0; i1 < nf; i1++) {
if (flags[i1] == 1) {
for (p1 = files[i1]; *p1; p1++) {
if (*p1 == '.') putchar('/');
else putchar(*p1);
}
putchar('\n');
}
}
}
===END histdups.c===
More information about the Alt.sources
mailing list