v16i095: finddup - Find duplicate files, Part01/01
Bill Davidsen
davidsen at crdos1.crd.ge.com
Fri Feb 8 05:52:40 AEST 1991
Submitted-by: davidsen at crdos1.crd.ge.com (Bill Davidsen)
Posting-number: Volume 16, Issue 95
Archive-name: finddup/part01
This is a program I wrote to help a friend sort through 300MB of GIF
files, many of which were duplicates with different names, like
"redwood.gif" and "bigtree.gif."
It does a fairly fast and very complete scan for duplicates, and
identifies hard links as well. Files with duplicate length are checked
by CRC, and then if they still match a byte by byte compare is done.
This seems to be the minimum which will find all duplicates while
avoiding any possibility of a false match.
Bill
---------------- Cut here --------------------
#!/bin/sh
# shar: Shell Archiver (v1.29)
#
# Run the following text with /bin/sh to create:
# finddup.c
# finddup.1C
# makefile
#
echo "x - extracting finddup.c (Text)"
sed 's/^X//' << 'SHAR_EOF' > finddup.c &&
X/*****************************************************************
X | finddup.c - the one true find duplicate files program
X |----------------------------------------------------------------
X | Bill Davidsen, last hacked 2/5/91
X | Copyright (c) 1991 by Bill Davidsen, all rights reserved. This
X | program may be freely used and distributed in its original
X | form. Modified versions must be distributed with the original
X | unmodified source code, and redistribution of the original code
X | or any derivative program may not be restricted.
X |----------------------------------------------------------------
X | Calling sequence:
X | finddup checklist
X |
X | where checklist is the name of a file containing filenames to
X | be checked, such as produced by "find . -type f -print >file"
X | returns a list of linked and duplicated files.
X ****************************************************************/
X
X#include <stdio.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X#include <malloc.h>
X
X/* parameters */
X#define MAXFN 120 /* max filename length */
X
X/* constants */
X#define EOS ((char) '\0') /* end of string */
X#define FL_CRC 0x0001 /* flag if CRC valid */
X#define FL_DUP 0x0002 /* files are duplicates */
X#define FL_LNK 0x0004 /* file is a link */
X
X/* macros */
X#ifdef DEBUG
X#define debug(X) printf X
X#else
X#define debug(X)
X#endif
X#define SORT qsort((char *)filelist, n_files, sizeof(filedesc), comp1);
X#define GetFlag(x,f) ((filelist[x].flags & (f)) != 0)
X#define SetFlag(x,f) (filelist[x].flags |= (f))
X
Xtypedef struct {
X off_t length; /* file length */
X unsigned long crc32; /* CRC for same length */
X dev_t device; /* physical device # */
X ino_t inode; /* inode for link detect */
X off_t nameloc; /* name loc in names file */
X char flags; /* flags for compare */
X} filedesc;
X
Xfiledesc *filelist; /* master sorted list of files */
Xlong n_files = 0; /* # files in the array */
Xlong max_files = 0; /* entries allocated in the array */
XFILE *namefd; /* file for names */
X#ifndef lint
Xstatic char *SCCSid[] = {
X "@(#)finddup.c v1.10 - 2/5/91",
X "Copyright (c) 1991 by Bill Davidsen, all rights reserved"
X};
X#endif
X
Xint comp1(); /* compare two filedesc's */
Xvoid scan1(); /* make the CRC scan */
Xvoid scan2(); /* do full compare if needed */
Xvoid scan3(); /* print the results */
Xunsigned long get_crc(); /* get crc32 on a file */
Xchar *getfn(); /* get a filename by index */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X char curfile[MAXFN];
X struct stat statbuf;
X off_t loc; /* location of name in the file */
X int zl_hdr = 1; /* need header for zero-length files list */
X filedesc *curptr; /* pointer to current storage loc */
X
X /* check for filename given, and open it */
X if (argc != 2) {
X fprintf(stderr, "Needs name of file with filenames\n");
X exit(1);
X }
X namefd = fopen(argv[1], "r");
X if (namefd == NULL) {
X perror("Can't open names file");
X exit(1);
X }
X
X /* start the list of name info's */
X filelist = (filedesc *) malloc(50 * sizeof(filedesc));
X if (filelist == NULL) {
X perror("Can't start files vector");
X exit(1);
X }
X /* finish the pointers */
X max_files = 50;
X debug(("First vector allocated @ %08lx, size %d bytes\n",
X (long) filelist, 50*sizeof(filedesc)
X ));
X fprintf(stderr, "build list...");
X
X /* this is the build loop */
X while (loc = ftell(namefd), fgets(curfile, MAXFN, namefd) != NULL) {
X /* check for room in the buffer */
X if (n_files == max_files) {
X /* allocate more space */
X max_files += 50;
X filelist =
X (filedesc *) realloc(filelist, (max_files)*sizeof(filedesc));
X if (filelist == NULL) {
X perror("Out of memory!");
X exit(1);
X }
X debug(("Got more memory!\n"));
X }
X curfile[strlen(curfile)-1] = EOS;
X
X /* add the data for this one */
X if (stat(curfile, &statbuf)) {
X fprintf(stderr, "%s - ", curfile);
X perror("ignored");
X continue;
X }
X
X /* check for zero length files */
X if ( statbuf.st_size == 0) {
X if (zl_hdr) {
X zl_hdr = 0;
X printf("Zero length files:\n\n");
X }
X printf("%s\n", curfile);
X continue;
X }
X
X curptr = filelist + n_files++;
X curptr->nameloc = loc;
X curptr->length = statbuf.st_size;
X curptr->device = statbuf.st_dev;
X curptr->inode = statbuf.st_ino;
X curptr->flags = 0;
X debug(("Name[%d] %s, size %ld, inode %d\n", n_files, curfile,
X (long) statbuf.st_size, statbuf.st_ino
X ));
X }
X
X /* sort the list by size, device, and inode */
X fprintf(stderr, "sort...");
X SORT;
X
X /* make the first scan for equal lengths */
X fprintf(stderr, "scan1...");
X scan1();
X
X /* make the second scan for dup CRC also */
X fprintf(stderr, "scan2...");
X scan2();
X
X fprintf(stderr, "done\n");
X
X#ifdef DEBUG
X for (loc = 0; loc < n_files; ++loc) {
X curptr = filelist + loc;
X printf("%8ld %08lx %6d %6d %02x\n",
X curptr->length, curptr->crc32,
X curptr->device, curptr->inode,
X curptr->flags
X );
X }
X#endif
X
X /* now scan and output dups */
X scan3();
X
X exit(0);
X}
X
X/* comp1 - compare two values */
Xint
Xcomp1(p1, p2)
Xchar *p1, *p2;
X{
X register filedesc *p1a = (filedesc *)p1, *p2a = (filedesc *)p2;
X register int retval;
X
X (retval = p1a->length - p2a->length) ||
X (retval = p1a->crc32 - p2a->crc32) ||
X (retval = p1a->device - p2a->device) ||
X (retval = p1a->inode - p2a->inode);
X
X return retval;
X}
X
X/* scan1 - get a CRC32 for files of equal length */
X
Xvoid
Xscan1() {
X FILE *fp;
X int ix, needsort = 0;
X
X for (ix = 1; ix < n_files; ++ix) {
X if (filelist[ix-1].length == filelist[ix].length) {
X /* get a CRC for each */
X if (! GetFlag(ix-1, FL_CRC)) {
X filelist[ix-1].crc32 = get_crc(ix-1);
X SetFlag(ix-1, FL_CRC);
X }
X if (! GetFlag(ix, FL_CRC)) {
X filelist[ix].crc32 = get_crc(ix);
X SetFlag(ix, FL_CRC);
X }
X needsort = 1;
X }
X }
X
X if (needsort) SORT;
X}
X
X/* scan2 - full compare if CRC is equal */
X
Xvoid
Xscan2() {
X int ix, ix2, lastix;
X int inmatch; /* 1st filename has been printed */
X int need_hdr = 1; /* Need a hdr for the hard link list */
X int lnkmatch; /* flag for matching links */
X register filedesc *p1, *p2;
X filedesc wkdesc;
X
X /* mark links and output before dup check */
X for (ix = 0; ix < n_files; ix = ix2) {
X p1 = filelist + ix;
X for (ix2 = ix+1, p2 = p1+1, inmatch = 0;
X ix2 < n_files
X && p1->device == p2->device
X && p1->inode == p2->inode;
X ++ix2, ++p2
X ) {
X SetFlag(ix2, FL_LNK);
X if (need_hdr) {
X need_hdr = 0;
X printf("\n\nHard link summary:\n\n");
X }
X
X if (!inmatch) {
X inmatch = 1;
X printf("\nFILE: %s\n", getfn(ix));
X }
X printf("LINK: %s\n", getfn(ix2));
X }
X }
X debug(("\nStart dupscan"));
X
X /* now really scan for duplicates */
X for (ix = 0; ix < n_files; ix = lastix) {
X p1 = filelist + ix;
X for (lastix = ix2 = ix+1, p2 = p1+1, lnkmatch = 1;
X ix2 < n_files
X && p1->length == p2->length
X && p1->crc32 == p2->crc32;
X ++ix2, ++p2
X ) {
X if ((GetFlag(ix2, FL_LNK) && lnkmatch)
X || fullcmp(ix, ix2) == 0
X ) {
X SetFlag(ix2, FL_DUP);
X /* move if needed */
X if (lastix != ix2) {
X int n1, n2;
X
X debug(("\n swap %d and %d", lastix, ix2));
X wkdesc = filelist[ix2];
X for (n1 = ix2; n1 > lastix; --n1) {
X filelist[n1] = filelist[n1-1];
X }
X filelist[lastix++] = wkdesc;
X }
X lnkmatch = 1;
X }
X else {
X /* other links don't match */
X lnkmatch = 0;
X }
X }
X }
X}
X
X/* scan3 - output dups */
X
Xvoid
Xscan3()
X{
X register filedesc *p1, *p2;
X int ix, ix2, inmatch, need_hdr = 1;
X
X /* now repeat for duplicates, links or not */
X for (ix = 0; ix < n_files; ++ix) {
X if (GetFlag(ix, FL_DUP)) {
X /* header on the very first */
X if (need_hdr) {
X need_hdr = 0;
X printf("\n\nList of files with duplicate contents %s\n",
X "(includes hard links)"
X );
X }
X
X /* put out a header if you haven't */
X if (!inmatch) printf("\nFILE: %s\n", getfn(ix-1));
X inmatch = 1;
X printf("DUP: %s\n", getfn(ix));
X }
X else inmatch = 0;
X }
X}
X
X/* get_crc - get a CRC32 for a file */
X
Xunsigned long
Xget_crc(ix)
Xint ix;
X{
X FILE *fp;
X register unsigned long val1 = 0x90909090, val2 = 0xeaeaeaea;
X register int carry;
X char ch;
X char fname[MAXFN];
X
X /* open the file */
X fseek(namefd, filelist[ix].nameloc, 0);
X fgets(fname, MAXFN, namefd);
X fname[strlen(fname)-1] = EOS;
X debug(("\nCRC start - %s ", fname));
X if ((fp = fopen(fname, "r")) == NULL) {
X fprintf(stderr, "Can't read file %s\n", fname);
X exit(1);
X }
X
X /* build the CRC values */
X while ((ch = fgetc(fp)) != EOF) {
X carry = (val1 & 0x8000000) != 0;
X val1 = (val1 << 1) ^ ch + carry;
X val2 += ch << (ch & 003);
X }
X fclose(fp);
X debug(("v1: %08lx v2: %08lx ", val1, val2));
X
X return ((val1 & 0xffff) << 12) ^ (val2 && 0xffffff);
X}
X
X/* getfn - get filename from index */
X
Xchar *
Xgetfn(ix)
Xoff_t ix;
X{
X static char fnbuf[MAXFN];
X
X fseek(namefd, filelist[ix].nameloc, 0);
X fgets(fnbuf, MAXFN, namefd);
X fnbuf[strlen(fnbuf)-1] = EOS;
X
X return fnbuf;
X}
X
X/* fullcmp - compare two files, bit for bit */
X
Xint
Xfullcmp(v1, v2)
Xint v1, v2;
X{
X FILE *fp1, *fp2;
X char filename[MAXFN];
X register int ch;
X
X /* open the files */
X strcpy(filename, getfn(v1));
X fp1 = fopen(filename, "r");
X if (fp1 == NULL) {
X fprintf(stderr, "%s: ", filename);
X perror("can't access for read");
X exit(1);
X }
X debug(("\nFull compare %s\n and", filename));
X
X strcpy(filename, getfn(v2));
X fp2 = fopen(filename, "r");
X if (fp2 == NULL) {
X fprintf(stderr, "%s: ", filename);
X perror("can't access for read");
X exit(1);
X }
X debug(("%s", filename));
X
X /* now do the compare */
X while ((ch = getc(fp1)) != EOF) {
X if (ch - getc(fp2)) break;
X }
X
X /* close files and return value */
X fclose(fp1);
X fclose(fp2);
X debug(("\n return %d", !(ch == EOF)));
X return (!(ch == EOF));
X}
SHAR_EOF
chmod 0644 finddup.c || echo "restore of finddup.c fails"
echo "x - extracting finddup.1C (Text)"
sed 's/^X//' << 'SHAR_EOF' > finddup.1C &&
X.TH FINDDUP 1 LOCAL
X.SH NAME
Xfinddup - find duplicate files in a list
X.SH SYNOPSIS
Xfinddup filename
X.SH DESCRIPTION
X.ds fd \fBfinddup\fP
X\*(fd reads a list of filenames from the named file and scans them,
Xbuilding a list of duplicate files and hard links. These are then
Xwritten to stdout for the user's information. This can be used to reduce
Xdisk usage, etc.
X.SS How it works
X\*(fd stats each name and saves the file length, device, and inode. It
Xthen sorts the list and builds a CRC for each file which has the same
Xlength as another file. For files which have the same length and CRC, a
Xbyte by byte comparison is done to be sure that they are duplicates.
X.sp
XThe CRC step for N files of size S bytes requires reading n*S total
Xbytes, while the byte by byte check must be done for every file against
Xevery other, and read S*N*(N-1) bytes. Thus the CRC is a large timesaver
Xin most cases.
X.SH EXAMPLES
X $ find /u -type f -print > file.list.tmp
X $ finddup file.list.tmp
X.SH FILES
XOnly the file with the filenames.
X.SH SEE ALSO
Xfind(1).
X.SH DIAGNOSTICS
XIf files are named in the specification file but not present they will
Xbe ignored. If an existing file can not be read the program will
Xterminate rather than generate an incomplete list of duplicates.
X.SH LIMITATIONS
XAn option to generate a partial list could be added when a file can't be
Xaccessed. An option to list only duplites which are not hard links could
Xbe added.
X.SH AUTHOR
XBill Davidsen, davidsen at crdos1.crd.ge.com
X.SH Copyright
XCopyright (c) 1991 by Bill Davidsen, all rights reserved. This
Xprogram may be freely used and distributed in its original
Xform. Modified versions must be distributed with the original
Xunmodified source code, and redistribution of the original code
Xor any derivative program may not be restricted.
SHAR_EOF
chmod 0666 finddup.1C || echo "restore of finddup.1C fails"
echo "x - extracting makefile (Text)"
sed 's/^X//' << 'SHAR_EOF' > makefile &&
X# makefile for finddup
X
X# ================> custom here
XCC = /bin/cc
XCFLAGS = -O
XLFLAGS =
XLIBS =
X# your favorite shar prog
XSHAR = shar
XSHARLST = finddup.c finddup.1C makefile
X# ================> custom ends
X
Xfinddup: finddup.o
X $(CC) -o finddup $(LFLAGS) finddup.o -s
X
Xdebug: rfinddup
X
X# note that this does NOT leave an object file
Xrfinddup: finddup.c
X $(CC) -o rfinddup -g -DDEBUG finddup.c
X
Xshar: finddup.shar
X
Xfinddup.shar: $(SHARLST)
X $(SHAR) $(SHARLST) > finddup.shar
X
Xclean:
X [ -f p.finddup.c -o ! -f s.finddup.c ] && exit 1
X rm -f finddup.[co] finddup rfinddup finddup.shar
SHAR_EOF
chmod 0644 makefile || echo "restore of makefile fails"
exit 0
exit 0 # Just in case...
--
Kent Landfield INTERNET: kent at sparky.IMD.Sterling.COM
Sterling Software, IMD UUCP: uunet!sparky!kent
Phone: (402) 291-8300 FAX: (402) 291-4362
Please send comp.sources.misc-related mail to kent at uunet.uu.net.
More information about the Comp.sources.misc
mailing list