v13i028: Replacement for the file(1) command, Part02/02
Rich Salz
rsalz at bbn.com
Tue Feb 9 07:26:33 AEST 1988
Submitted-by: "Ian F. Darwin" <ian at sq.com>
Posting-number: Volume 13, Issue 28
Archive-name: file/part02
: to unbundle, sh this file
echo x - strtok.c 1>&2
cat >strtok.c <<'@@@End of strtok.c'
/*
* Get next token from string s (NULL on 2nd, 3rd, etc. calls),
* where tokens are nonempty strings separated by runs of
* chars from delim. Writes NULs into s to end tokens. delim need not
* remain constant from call to call.
*
* Copyright (c) Henry Spencer.
* Written by Henry Spencer.
*
* This software is not subject to any license of the American Telephone
* and Telegraph Company or of the Regents of the University of California.
*
* Permission is granted to anyone to use this software for any purpose on
* any computer system, and to alter it and redistribute it freely, subject
* to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from flaws in it.
*
* 2. The origin of this software must not be misrepresented, either by
* explicit claim or by omission. Since few users ever read sources,
* credits must appear in the documentation.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software. Since few users
* ever read sources, credits must appear in the documentation.
*
* 4. This notice may not be removed or altered.
*/
#define NULL 0
#define CONST
/*
 * NOTE: altered version of Henry Spencer's original strtok(); the code has
 * been restyled only, its behaviour is unchanged.
 */

/* Where a continuation call (s == NULL) resumes; NULL when nothing is left. */
static char *scanpoint = NULL;

/*
 * strtok - return the next token of a string, one token per call.
 *
 * s      string to tokenize on the first call, NULL on later calls to
 *        continue with the string remembered from the previous call
 * delim  set of delimiter characters; may differ from call to call
 *
 * Returns a pointer to the token, or NULL if no token is left.  The
 * delimiter that ends a token is overwritten with a NUL, so s is modified.
 * The resume position is kept in a file-scope static.
 */
char *
strtok(s, delim)
char *s;
register CONST char *delim;
{
	register char *cursor;
	register CONST char *d;
	char *start;

	cursor = (s != NULL) ? s : scanpoint;
	if (cursor == NULL)
		return(NULL);

	/* Step over any run of delimiters in front of the token. */
	while (*cursor != '\0') {
		d = delim;
		while (*d != '\0' && *d != *cursor)
			d++;
		if (*d == '\0')
			break;		/* not a delimiter: token starts here */
		cursor++;
	}

	/* Nothing but delimiters (or nothing at all) remained. */
	if (*cursor == '\0') {
		scanpoint = NULL;
		return(NULL);
	}

	start = cursor;

	/* Walk the token until a delimiter or the end of the string. */
	while (*cursor != '\0') {
		for (d = delim; *d != '\0'; d++) {
			if (*d == *cursor) {
				*cursor = '\0';
				scanpoint = cursor + 1;
				return(start);
			}
		}
		cursor++;
	}

	/* The token ran to the end of the string; no further tokens. */
	scanpoint = NULL;
	return(start);
}
@@@End of strtok.c
echo x - strchr.c 1>&2
cat >strchr.c <<'@@@End of strchr.c'
/*
* Local copy of strchr (a.k.a. index) for portability.
* Totally public domain.
*/
#include <stdio.h>
/*
 * strchr - find the first occurrence of character c in string s.
 *
 * Returns a pointer to the matching character, or NULL if c does not
 * occur.  The terminating NUL is compared before the end-of-string test,
 * so searching for '\0' returns a pointer to the terminator.
 */
char *
strchr(s, c)
char *s, c;
{
	char *p;

	for (p = s; *p != c; p++) {
		if (*p == '\0')
			return(NULL);
	}
	return(p);
}
@@@End of strchr.c
echo x - file.h 1>&2
cat >file.h <<'@@@End of file.h'
/*
* file.h - definitions for file(1) program
# @(#)$Header: file.h,v 1.4 87/09/18 10:56:09 ian Exp $
*
* Copyright (c) Ian F. Darwin, 1987.
* Written by Ian F. Darwin.
*
* This software is not subject to any license of the American Telephone
* and Telegraph Company or of the Regents of the University of California.
*
* Permission is granted to anyone to use this software for any purpose on
* any computer system, and to alter it and redistribute it freely, subject
* to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from flaws in it.
*
* 2. The origin of this software must not be misrepresented, either by
* explicit claim or by omission. Since few users ever read sources,
* credits must appear in the documentation.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software. Since few users
* ever read sources, credits must appear in the documentation.
*
* 4. This notice may not be removed or altered.
*/
#define HOWMANY 1024 /* how much of the file to look at */
#define MAXMAGIS 250 /* max entries in /etc/magic */
#define MAXDESC 50 /* max leng of text description */
#define MAXstring 32 /* max leng of "string" types */
#define ckfputs(str,fil) {if (fputs(str,fil)==EOF) error(ckfmsg,"");}
/*
 * One test entry: where to look (offset), how to interpret the bytes
 * (type), what to compare them with (reln, value) and the text that
 * describes a match (desc).
 */
struct magic {
short contflag; /* 1 if '>0' appears */
long offset; /* offset to magic number */
char reln; /* relation (0=eq, '>'=gt, etc) */
char type; /* one of BYTE, SHORT, LONG or STRING (defined below) */
char vallen; /* length of string value, if any */
/* Codes stored in 'type'. */
#define BYTE 1
#define SHORT 2
#define LONG 4
#define STRING 5
union VALUETYPE {
char b; /* value when type is BYTE -- NOTE(review): member/type pairing inferred from names */
short h; /* value when type is SHORT */
long l; /* value when type is LONG */
char s[MAXstring]; /* value when type is STRING; vallen holds its length */
} value; /* either number or string */
char desc[MAXDESC]; /* description */
};
/* Neither returns a value.  error() is not defined in this header. */
extern void error(), exit();
@@@End of file.h
echo x - names.h 1>&2
cat >names.h <<'@@@End of names.h'
/*
* Names.h - names and types used by ascmagic in file(1).
* These tokens are here because they can appear anywhere in
* the first HOWMANY bytes, while tokens in /etc/magic must
* appear at fixed offsets into the file. Don't make HOWMANY
* too high unless you have a very fast CPU.
*
* Copyright (c) Ian F. Darwin, 1987.
* Written by Ian F. Darwin.
*
* This software is not subject to any license of the American Telephone
* and Telegraph Company or of the Regents of the University of California.
*
* Permission is granted to anyone to use this software for any purpose on
* any computer system, and to alter it and redistribute it freely, subject
* to the terms in the accompanying LEGAL.NOTICE file.
*/
/* these types are used to index the table 'types': keep em in sync! */
#define L_C 0 /* first and foremost on UNIX */
#define L_FORT 1 /* the oldest one */
#define L_MAKE 2 /* Makefiles */
#define L_PLI 3 /* PL/1 */
#define L_MACH 4 /* some kinda assembler */
#define L_ENG 5 /* English */
#define L_PAS 6 /* Pascal */
#define L_MAIL 7 /* Electronic mail */
#define L_NEWS 8 /* Usenet Netnews */
/*
 * Descriptions, indexed by the L_* constants above (entry N describes the
 * L_* constant whose value is N).  Two extra entries follow the last real
 * one: a "can't happen" message at index 9 and a terminating 0.
 */
char *types[] = {
"c program text",
"fortran program text",
"make commands text" ,
"pl/1 program text",
"assembler program text",
"English text",
"pascal program text",
"mail text",
"news text",
"can't happen error on names.h/types",
0};
/*
 * Table pairing a token with the L_* type it suggests.  The last
 * initializer is a lone 0, giving a final entry whose name is a null
 * pointer; NNAMES below excludes that entry from the count.
 */
struct names {
char *name; /* token text to look for */
short type; /* L_* index into types[] */
} names[] = {
/* These must be sorted by eye for optimal hit rate */
/* Add to this list only after substantial meditation */
{"/*", L_C}, /* must precede "The", "the", etc. */
{"#include", L_C},
{"char", L_C},
{"The", L_ENG},
{"the", L_ENG},
{"double", L_C},
{"extern", L_C},
{"float", L_C},
{"real", L_C},
{"struct", L_C},
{"union", L_C},
{"CFLAGS", L_MAKE},
{"LDFLAGS", L_MAKE},
{"all:", L_MAKE},
{".PRECIOUS", L_MAKE},
/* Too many files of text have these words in them. Find another way
* to recognize Fortrash.
*/
#ifdef NOTDEF
{"subroutine", L_FORT},
{"function", L_FORT},
{"block", L_FORT},
{"common", L_FORT},
{"dimension", L_FORT},
{"integer", L_FORT},
{"data", L_FORT},
#endif /*NOTDEF*/
{".ascii", L_MACH},
{".asciiz", L_MACH},
{".byte", L_MACH},
{".even", L_MACH},
{".globl", L_MACH},
{"clr", L_MACH},
{"(input,", L_PAS},
{"dcl", L_PLI},
{"Received:", L_MAIL},
{">From", L_MAIL},
{"Return-Path:",L_MAIL},
{"Cc:", L_MAIL},
{"Newsgroups:", L_NEWS},
{"Path:", L_NEWS},
{"Organization:",L_NEWS},
0};
/* Number of real entries in names[]: the array length minus the final 0 entry. */
#define NNAMES ((sizeof(names)/sizeof(struct names)) - 1)
@@@End of names.h
echo x - tar.h 1>&2
cat >tar.h <<'@@@End of tar.h'
/*
* Header file for public domain tar (tape archive) program.
*
* @(#)tar.h 1.20 86/10/29 Public Domain.
*
* Created 25 August 1985 by John Gilmore, ihnp4!hoptoad!gnu.
*/
/*
* Kludge for handling systems that can't cope with multiple
* external definitions of a variable. In ONE routine (tar.c),
* we #define TAR_EXTERN to null; here, we set it to "extern" if
* it is not already set.
*/
#ifndef TAR_EXTERN
#define TAR_EXTERN extern
#endif
/*
* Header block on tape.
*
* I'm going to use traditional DP naming conventions here.
* A "block" is a big chunk of stuff that we do I/O on.
* A "record" is a piece of info that we care about.
* Typically many "record"s fit into a "block".
*/
#define RECORDSIZE 512 /* bytes in one record */
#define NAMSIZ 100 /* size of the name and linkname fields */
#define TUNMLEN 32 /* size of the uname field */
#define TGNMLEN 32 /* size of the gname field */
/*
 * One record, viewable either as RECORDSIZE raw bytes (charptr) or as the
 * named header fields (header), which overlay the start of those bytes.
 * Every header field is a char array (linkflag is a single char).
 */
union record {
char charptr[RECORDSIZE];
struct header {
char name[NAMSIZ];
char mode[8];
char uid[8];
char gid[8];
char size[12];
char mtime[12];
char chksum[8]; /* see CHKBLANKS */
char linkflag; /* one of the LF_* values */
char linkname[NAMSIZ];
char magic[8]; /* see TMAGIC */
char uname[TUNMLEN];
char gname[TGNMLEN];
char devmajor[8];
char devminor[8];
} header;
};
/*
 * The checksum field is filled with this while the checksum is computed.
 * The chksum field is 8 bytes wide, so the literal must hold exactly
 * 8 blanks; only those 8 characters are meant to be copied, not the
 * literal's terminating NUL.
 */
#define CHKBLANKS "        " /* 8 blanks, no null */
/*
 * The magic field is filled with this if uname and gname are valid.
 * The magic field is 8 bytes wide: "ustar" followed by two blanks makes
 * 7 characters, and the literal's terminating NUL is the 8th byte.
 */
#define TMAGIC "ustar  " /* 7 chars and a null */
/* The linkflag defines the type of file */
#define LF_OLDNORMAL '\0' /* Normal disk file, Unix compat */
#define LF_NORMAL '0' /* Normal disk file */
#define LF_LINK '1' /* Link to previously dumped file */
#define LF_SYMLINK '2' /* Symbolic link */
#define LF_CHR '3' /* Character special file */
#define LF_BLK '4' /* Block special file */
#define LF_DIR '5' /* Directory */
#define LF_FIFO '6' /* FIFO special file */
#define LF_CONTIG '7' /* Contiguous file */
/* Further link types may be defined later. */
/*
* Exit codes from the "tar" program
*/
#define EX_SUCCESS 0 /* success! */
#define EX_ARGSBAD 1 /* invalid args */
#define EX_BADFILE 2 /* invalid filename */
#define EX_BADARCH 3 /* bad archive */
#define EX_SYSTEM 4 /* system gave unexpected error */
/*
* Global variables
*/
TAR_EXTERN union record *ar_block; /* Start of block of archive */
TAR_EXTERN union record *ar_record; /* Current record of archive */
TAR_EXTERN union record *ar_last; /* Last+1 record of archive block */
TAR_EXTERN char ar_reading; /* 0 writing, !0 reading archive */
TAR_EXTERN int blocking; /* Size of each block, in records */
TAR_EXTERN int blocksize; /* Size of each block, in bytes */
TAR_EXTERN char *ar_file; /* File containing archive */
TAR_EXTERN char *name_file; /* File containing names to work on */
TAR_EXTERN char *tar; /* Name of this program */
/*
* Flags from the command line
*/
TAR_EXTERN char f_reblock; /* -B */
TAR_EXTERN char f_create; /* -c */
TAR_EXTERN char f_debug; /* -d */
TAR_EXTERN char f_sayblock; /* -D */
TAR_EXTERN char f_follow_links; /* -h */
TAR_EXTERN char f_ignorez; /* -i */
TAR_EXTERN char f_keep; /* -k */
TAR_EXTERN char f_modified; /* -m */
TAR_EXTERN char f_oldarch; /* -o */
TAR_EXTERN char f_use_protection; /* -p */
TAR_EXTERN char f_sorted_names; /* -s */
TAR_EXTERN char f_list; /* -t */
TAR_EXTERN char f_namefile; /* -T */
TAR_EXTERN char f_verbose; /* -v */
TAR_EXTERN char f_extract; /* -x */
TAR_EXTERN char f_compress; /* -z */
/*
* We now default to Unix Standard format rather than 4.2BSD tar format.
* The code can actually produce all three:
* f_standard ANSI standard
* f_oldarch V7
* neither 4.2BSD
* but we don't bother, since 4.2BSD can read ANSI standard format anyway.
* The only advantage to the "neither" option is that we can cmp(1) our
* output to the output of 4.2BSD tar, for debugging.
*/
#define f_standard (!f_oldarch)
/*
* Structure for keeping track of filenames and lists thereof.
*/
/* Node of a singly linked list of file names. */
struct name {
struct name *next; /* following node in the list */
short length; /* NOTE(review): presumably the length of name[] -- confirm */
char found; /* NOTE(review): presumably set once the name has been matched -- confirm */
char name[NAMSIZ+1]; /* one byte larger than the header's name field */
};
TAR_EXTERN struct name *namelist; /* Points to first name in list */
TAR_EXTERN struct name *namelast; /* Points to last name in list */
TAR_EXTERN int archive; /* File descriptor for archive file */
TAR_EXTERN int errors; /* # of files in error */
/*
*
* Due to the next struct declaration, each routine that includes
* "tar.h" must also include <sys/types.h>. I tried to make it automatic,
* but System V has no defines in <sys/types.h>, so there is no way of
* knowing when it has been included. In addition, it cannot be included
* twice, but must be included exactly once. Argghh!
*
* Thanks, typedef. Thanks, USG.
*/
/*
 * Node of a singly linked list keyed by device and inode number.
 * dev_t and ino_t are not declared in this header.
 */
struct link {
struct link *next; /* following node in the list */
dev_t dev; /* device number */
ino_t ino; /* inode number */
short linkcount; /* NOTE(review): presumably the file's link count -- confirm */
char name[NAMSIZ+1]; /* one byte larger than the header's name field */
};
TAR_EXTERN struct link *linklist; /* Points to first link in list */
/*
* Error recovery stuff
*/
TAR_EXTERN char read_error_flag;
/*
* Declarations of functions available to the world.
*/
/*LINTLIBRARY*/
union record *findrec();
void userec();
union record *endofrecs();
void anno();
#define annorec(stream, msg) anno(stream, msg, 0) /* Cur rec */
#define annofile(stream, msg) anno(stream, msg, 1) /* Saved rec */
@@@End of tar.h
echo x - Makefile 1>&2
cat >Makefile <<'@@@End of Makefile'
# Makefile for file(1) cmd.
# Copyright (c) Ian F. Darwin 86/09/01 - see LEGAL.NOTICE.
# @(#)$Header: Makefile,v 1.17 88/01/15 13:03:16 ian Exp $
#
SHELL = /bin/sh
MAGIC = /etc/magic
DEFS = -DMAGIC='"$(MAGIC)"' # -Dvoid=int
COPTS = -O # -g
CFLAGS = $(COPTS) $(DEFS)
SHAR = bundle
OFILE = /bin/file.orig # old or distributed version, for comparison
# Where new binary lives; typically /usr/local (BSD), /usr/lbin (USG).
BINDIR = /usr/local
# For installing our man pages;
# MANCxxx is manual section for Commands, MANFxxx is section for file formats.
# MANxDIR is the directory name; MANxEXT is the filename extension. Usual values:
# Variable V7 4BSD Sys V
# MANCDIR /usr/man/man1 /usr/man/man1 /usr/man/u_man/man1
# MANFDIR /usr/man/man5 /usr/man/man5 /usr/man/u_man/man4
# MANCEXT 1 1 1
# MANFEXT 5 5 4
# --- possible alternative for 4BSD ---
# MANCDIR /usr/man/manl
# MANCEXT l
# --- possible alternative for USG ---
# MANCDIR /usr/man/local/man1
# MANCEXT 1
MANCDIR = /usr/man/manl
MANFDIR = /usr/man/man5
MANCEXT = l
MANFEXT = 5
# There are no system-dependent configuration options (except maybe CFLAGS).
# Delete any of LOCALSRCS and LOCALOBJS that are in your C library.
LOCALSRCS = getopt.c strtol.c strtok.c strchr.c
SRCS = file.c apprentice.c fsmagic.c softmagic.c ascmagic.c is_tar.c \
print.c $(LOCALSRCS)
LOCALOBJS = getopt.o strtol.o strtok.o strchr.o
OBJS = file.o apprentice.o fsmagic.o softmagic.o ascmagic.o is_tar.o \
print.o $(LOCALOBJS)
ALLSRC = LEGAL.NOTICE README PORTING $(SRCS) *.h \
Makefile file.1 magic.4 magdir/[a-z]* tst/Makefile
all: file magic
# Compare this build against the reference binary $(OFILE): run both over
# the same files with the freshly built ./magic, time them, and diff the
# two outputs (the diff is allowed to fail).
# Recipe lines must begin with a tab.
try: all $(OFILE)
	cd tst; make
	time $(OFILE) -m ./magic * tst/* >/tmp/t1
	time ./file -m ./magic * tst/* >/tmp/t2
	-diff -b /tmp/t[12]
	what ./file >lastnocore
# Link the program from all object files.  Recipe lines must begin with a tab.
file: $(OBJS)
	cc $(CFLAGS) $(OBJS) -o $@
# Run lint over all sources; tee also saves the report in a file named
# after the target.  Recipe lines must begin with a tab.
lint: $(SRCS)
	lint -ha $(DEFS) $(SRCS) | tee $@
# Build the magic file by concatenating the fragments in magdir.
# Recipe lines must begin with a tab.
magic: magdir
# exclude RCS or SCCS dirs:
	cat magdir/[a-z]* >$@
ascmagic.o: names.h
apprentice.o ascmagic.o file.o fsmagic.o print.o softmagic.o: file.h
# Install the binary, the magic file and both manual pages.
# The three destination directories are prerequisites, so make stops
# before copying anything if one of them does not exist.  $(MANFDIR) is
# listed because magic.4 is copied there.
# Recipe lines must begin with a tab.
install: file magic file.1 magic.4 $(BINDIR) $(MANCDIR) $(MANFDIR)
	cp file $(BINDIR)/file
	cp magic $(MAGIC)
	cp file.1 $(MANCDIR)/file.$(MANCEXT)
	cp magic.4 $(MANFDIR)/magic.$(MANFEXT)
# Remove build products here and in tst.  Recipe lines must begin with a tab.
clean:
	rm -f *.o file magic lint.out
	(cd tst; make clean)
# Bundle all distributed files into a shell archive named after the target.
# Recipe lines must begin with a tab; the line after the backslash is a
# continuation of the sed command and is deliberately left unindented.
dist: $(ALLSRC)
# Some versions of shar can't handle a single file from
# a subdirectory, so we manually insert mkdir as needed.
# Put the extra "mkdir" AFTER the ": to unbundle..." line.
	$(SHAR) $(ALLSRC) | sed -e '1a\
mkdir magdir tst' >$@
@@@End of Makefile
echo x - file.1 1>&2
cat >file.1 <<'@@@End of file.1'
..TH FILE 1 "Copyright but distributable"
..SH NAME
..I file
\- determine file type
..SH SYNOPSIS
..B file
[
..B -c
]
[
..B -f
namefile ]
[
..B -m
magicfile ]
file ...
..SH DESCRIPTION
..I File
tests each argument in an attempt to classify it.
There are three sets of tests, performed in this order:
filesystem tests, magic number tests, and language tests.
The
..I first
test that succeeds causes the file type to be printed.
..PP
The type printed will usually contain one of the words
..B text
(the file contains only ASCII characters and is
probably safe to read on an ASCII terminal),
..B executable
(the file contains the result of compiling a program
in a form understandable to some \s-1UNIX\s0 kernel or another),
or
..B data
meaning anything else (data is usually `binary' or non-printable).
Exceptions are well-known file formats (core files, tar archives)
that are known to contain binary data.
When modifying the file
..I /etc/magic
or the program itself,
..B "preserve these keywords" .
People depend on knowing that all the readable files in a directory
have the word ``text'' printed.
Don't do as one computer vendor did \- change ``shell commands text''
to ``shell script''.
..PP
The filesystem tests are based on examining the return from a
..I stat (2)
system call.
The program checks to see if the file is empty,
or if it's some sort of special file.
Any known file types appropriate to the system you are running on
(sockets and symbolic links on 4.2BSD, named pipes (FIFOs) on System V)
are intuited if they are defined in
the system header file
..I sys/stat.h .
..PP
The magic number tests are used to check for files with data in
particular fixed formats.
The canonical example of this is a binary executable (compiled program)
..I a.out
file, whose format is defined in
..I a.out.h
and possibly
..I exec.h
in the standard include directory.
These files have a `magic number' stored in a particular place
near the beginning of the file that tells the \s-1UNIX\s0 operating system
that the file is a binary executable, and which of several types thereof.
The concept of `magic number' has been applied by extension to data files.
Any file with some invariant identifier at a small fixed
offset into the file can usually be described in this way.
The information in these files is read from the magic file
..I /etc/magic .
..PP
If an argument appears to be an
..SM ASCII
file,
..I file
attempts to guess its language.
The language tests look for particular strings (cf \fInames.h\fP)
that can appear anywhere in the first few blocks of a file.
For example, the keyword
..I .br
indicates that the file is most likely a troff input file,
just as the keyword
..I struct
indicates a C program.
These tests are less reliable than the previous
two groups, so they are performed last.
The language test routines also test for some miscellany
(such as
..I tar
archives) and determine whether an unknown file should be
labelled as `ascii text' or `data'.
..PP
Use
..B -m
..I file
to specify an alternate file of magic numbers.
..PP
The
..B -c
option causes a checking printout of the parsed form of the magic file.
This is usually used in conjunction with
..B -m
to debug a new magic file before installing it.
..PP
The
..B -f
..I namefile
option specifies that the names of the files to be examined
are to be read (one per line) from
..I namefile
before the argument list.
Either
..I namefile
or at least one filename argument must be present;
to test the standard input, use ``-'' as a filename argument.
..SH FILES
..I /etc/magic
\- default list of magic numbers
..SH SEE ALSO
..IR Magic (FILES)
\- description of magic file format.
..br
..IR Strings (1), " od" (1)
\- tools for examining non-textfiles.
..SH STANDARDS CONFORMANCE
This program is believed to exceed the System V Interface Definition
of FILE(CMD), as near as one can determine from the vague language
contained therein.
Its behaviour is mostly compatible with the System V program of the same name.
This version knows more magic, however, so it will produce
different (albeit more accurate) output in many cases.
..PP
The one significant difference
between this version and System V
is that this version treats any white space
as a delimiter, so that spaces in pattern strings must be escaped.
For example,
..br
>10 string language impress\ (imPRESS data)
..br
in an existing magic file would have to be changed to
..br
>10 string language\e impress (imPRESS data)
..PP
The Sun Microsystems implementation of System V compatibility
includes a file(1) command that has some extensions.
My version differs from Sun's only in minor ways.
The significant one is the `&' operator, which Sun's program expects as,
for example,
..br
>16 long&0x7fffffff >0 not stripped
..br
would be entered in my version as
..br
>16 long &0x7fffffff not stripped
..br
which is a little less general; it simply tests (location 16)&0x7fffffff
and returns its truth value as a C expression.
..SH MAGIC DIRECTORY
The magic file entries have been collected from various sources,
mainly USENET, and contributed by various authors.
Ian Darwin (address below) will collect additional
or corrected magic file entries.
A consolidation of magic file entries
will be distributed periodically.
..PP
The order of entries in the magic file is significant.
Depending on what system you are using, the order that
they are put together may be incorrect.
If your old
..I file
command uses a magic file,
keep the old magic file around for comparison purposes
(rename it to
..IR /etc/magic.orig ).
..SH HISTORY
There has been a
..I file
command in every UNIX since at least Research Version 6
(man page dated January, 1975).
The System V version introduced one significant major change:
the external list of magic number types.
This slowed the program down slightly but made it a lot more flexible.
..PP
This program, based on the System V version,
was written by Ian Darwin without looking at anybody else's source code.
..PP
John Gilmore revised the code extensively, making it better than
the first version.
Geoff Collyer found several inadequacies
and provided some magic file entries.
The program has undergone continued evolution since.
..SH NOTICE
Copyright (c) Ian F. Darwin, 1986 and 1987.
Written by Ian F. Darwin, UUCP address {utzoo | ihnp4}!darwin!ian,
Internet address ian at sq.com,
postal address: P.O. Box 603, Station F, Toronto, Ontario, CANADA M4Y 2L8.
..PP
..I Strtok.c
and
..I getopt.c
written by and copyright by Henry Spencer, utzoo!henry.
..PP
This software is not subject to any license of the American Telephone
and Telegraph Company or of the Regents of the University of California.
..PP
Permission is granted to anyone to use this software for any purpose on
any computer system, and to alter it and redistribute it freely, subject
to the following restrictions:
..PP
1. The author is not responsible for the consequences of use of this
software, no matter how awful, even if they arise from flaws in it.
..PP
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. Since few users ever read sources,
credits must appear in the documentation.
..PP
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software. Since few users
ever read sources, credits must appear in the documentation.
..PP
4. This notice may not be removed or altered.
..PP
A few support files (\fIgetopt\fP, \fIstrtok\fP)
distributed with this package
are by Henry Spencer and are subject to the same terms as above.
..PP
A few simple support files (\fIstrtol\fP, \fIstrchr\fP)
distributed with this package
are in the public domain; they are so marked.
..PP
The files
..I tar.h
and
..I is_tar.c
were written by John Gilmore from his public-domain
..I tar
program, and are not covered by the above restrictions.
..SH BUGS
There must be a way to automate the construction of the Magic
file from all the glop in magdir. What is it?
..PP
..I File
uses several algorithms that favor speed over accuracy,
thus it can be misled about the contents of ASCII files.
..PP
The support for ASCII files (primarily for programming languages)
is simplistic, inefficient and requires recompilation to update.
..PP
Should there be an ``else'' clause to follow a series of continuation lines?
..PP
Is it worthwhile to implement recursive file inspection,
so that compressed files, uuencoded, etc., can say ``compressed
ascii text'' or ``compressed executable'' or ``compressed tar archive''
or whatever?
..PP
The magic file and keywords should have regular expression support.
..PP
It might be advisable to allow upper-case letters in keywords
for e.g., troff commands vs man page macros.
Regular expression support would make this easy.
..PP
The program doesn't grok \s-2FORTRAN\s0.
It should be able to figure \s-2FORTRAN\s0 by seeing some keywords which
appear indented at the start of line.
Regular expression support would make this easy.
..PP
The list of keywords in
..I ascmagic
probably belongs in the Magic file.
This could be done by using some keyword like `*' for the offset value.
..PP
The program should malloc the magic file structures,
rather than using a fixed-size array as at present.
..PP
The magic file should be compiled into binary
(or better yet, fixed-length ASCII strings
for use in heterogeneous network environments) for faster startup.
Then the program would run as fast as the Version 7 program of the same name,
with the flexibility of the System V version.
But then there would have to be yet another magic number for the
..I magic.out
file.
..PP
Another optimisation would be to sort
the magic file so that we can just run down all the
tests for the first byte, first word, first long, etc, once we
have fetched it. Complain about conflicts in the magic file entries.
Make a rule that the magic entries sort based on file offset rather
than position within the magic file?
..PP
The program should provide a way to give an estimate
of ``how good'' a guess is.
We end up removing guesses (e.g. ``From '' as first 5 chars of file) because
they are not as good as other guesses (e.g. ``Newsgroups:'' versus
``Return-Path:''). Still, if the others don't pan out, it should be
possible to use the first guess.
..PP
Perhaps the program should automatically try all tests with
byte-swapping done, to avoid having to figure out the byte-swapped values
when constructing the magic file.
Of course this will run more slowly, so it should probably be
an option (-a?).
..PP
This manual page, and particularly this section, is too long.
@@@End of file.1
echo x - magic.4 1>&2
cat >magic.4 <<'@@@End of magic.4'
..TH MAGIC FILES "Public Domain"
..\" install as magic.4 on USG, magic.5 on V7 or Berkeley systems.
..SH NAME
magic \- file command's magic number file
..SH DESCRIPTION
The
..IR file (1)
command identifies the type of a file using,
among other tests,
a test for whether the file begins with a certain
..IR "magic number" .
The file
..B /etc/magic
specifies what magic numbers are to be tested for,
what message to print if a particular magic number is found,
and additional information to extract from the file.
..PP
Each line of the file specifies a test to be performed.
A test compares the data starting at a particular offset
in the file with a 1-byte, 2-byte, or 4-byte numeric value or
a string. If the test succeeds, a message is printed.
The line consists of the following fields:
..IP offset \w'message'u+2n
A number specifying the offset, in bytes, into the file of the data
which is to be tested.
..IP type
The type of the data to be tested. The possible values are:
..RS
..IP byte \w'message'u+2n
A one-byte value.
..IP short
A two-byte value (on most systems).
..IP long
A four-byte value (on most systems).
..IP string
A string of bytes.
..RE
..IP test
The value to be compared with the value from the file. If the type is
numeric, this value
is specified in C form; if it is a string, it is specified as a C string
with the usual escapes permitted (e.g. \en for new-line).
..IP
Numeric values
may be preceded by a character indicating the operation to be performed.
It may be
..BR = ,
to specify that the value from the file must equal the specified value,
..BR < ,
to specify that the value from the file must be less than the specified
value,
..BR > ,
to specify that the value from the file must be greater than the specified
value,
or
..BR & ,
to specify that the value is to be AND'ed with the
numeric value before any comparisons are done.
Numeric values are specified in C form; e.g.
..B 13
is decimal,
..B 013
is octal, and
..B 0x13
is hexadecimal.
If the operation character
is omitted, it is assumed to be
..BR = .
..IP
For string values, the byte string from the
file must match the specified byte string.
The operators =, < and > (but not &) can be applied to strings.
The length used for matching is that of the string argument
in the magic file.
..IP message
The message to be printed if the comparison succeeds. If the string
contains a
..IR printf (3S)
format specification, the value from the file (with any specified masking
performed) is printed using the message as the format string.
..PP
Some file formats contain additional information which is to be printed
along with the file type. A line which begins with the character
..B >
indicates additional tests and messages to be printed. If the test on the
line preceding the first line with a
..B >
succeeds, the tests specified in all the subsequent lines beginning with
..B >
are performed, and the messages printed if the tests succeed. The next
line which does not begin with a
..B >
terminates this.
..SH BUGS
The formats
..I long
and
..I short
are system-dependent; perhaps they should be specified as a number
of bytes (2B, 4B, etc),
since the files being recognized typically come from
a system on which the lengths are invariant.
..PP
There should be more than one level of subtests,
with the level possibly indicated by
the number of
..B >
at the beginning of the line.
..SH SEE ALSO
..IR file (1)
\- the command that reads this file.
..\"
..\" From: guy at sun.uucp (Guy Harris)
..\" Newsgroups: net.bugs.usg
..\" Subject: /etc/magic's format isn't well documented
..\" Message-ID: <2752 at sun.uucp>
..\" Date: 3 Sep 85 08:19:07 GMT
..\" Organization: Sun Microsystems, Inc.
..\" Lines: 136
..\"
..\" Here's a manual page for the format accepted by the "file" made by adding
..\" the changes I posted to the S5R2 version.
..\"
..\" Modified for Ian Darwin's version of the file command.
..\" @(#)$Header: magic.4,v 1.5 87/11/06 20:54:31 ian Exp $
@@@End of magic.4
echo x - magdir/aa 1>&2
cat >magdir/aa <<'@@@End of magdir/aa'
#! file
# Magic data for file(1) command.
# Machine-generated from src/cmd/file/magdir/*; edit there only!
# Format is described in magic(files), where:
# files is 5 on V7 and BSD, 4 on SV, and ?? in the SVID.
@@@End of magdir/aa
echo x - magdir/aalocal 1>&2
cat >magdir/aalocal <<'@@@End of magdir/aalocal'
# Add any locally-observed files here. Remember:
# text if readable, executable if runnable binary, data if unreadable.
22 short 023000 core dump data
@@@End of magdir/aalocal
echo x - magdir/arc 1>&2
cat >magdir/arc <<'@@@End of magdir/arc'
0 byte 26 'arc' archive
>1 byte 0 (empty)
>1 byte 1 (old format)
@@@End of magdir/arc
echo x - magdir/archive 1>&2
cat >magdir/archive <<'@@@End of magdir/archive'
0 short 070707 cpio archive
0 string 070707 ASCII cpio archive
0 long 0177555 very old archive
0 short 0177555 very old PDP-11 archive
0 long 0177545 old archive
0 short 0177545 old PDP-11 archive
0 long 0100554 apl workspace
0 string =<ar> archive
0 string !<arch> archive
>8 string __.SYMDEF random library
0 string -h- archive (Software Tools format) text
@@@End of magdir/archive
echo x - magdir/c 1>&2
cat >magdir/c <<'@@@End of magdir/c'
# this first will upset you if you're a PL/1 shop...
# in which case rm it; ascmagic will catch real C programs
0 string /* c program text
# check for various C program generators...
# offsets derived empirically, your offsets may vary!
# (this obviously belongs in ascmagic.c/names.h!).
53 string yyprevious c program text (from lex)
@@@End of magdir/c
echo x - magdir/commands 1>&2
cat >magdir/commands <<'@@@End of magdir/commands'
0 string #!\ /bin/sh commands text
0 string #!/bin/sh commands text
0 string #!\ /bin/csh C shell commands text
0 string #!/bin/csh C shell commands text
0 string #!\ /bin/awk awk commands text
0 string #!/bin/awk awk commands text
0 string #!\ / some kinda commands text
0 string #!/ some kinda commands text
0 string #!\ commands text
>3 string >\0 for %s
# An "antique" kernel is either unmodified early V7,
# without DMR's 1979 mod for #!, or any kernel
# derived from a pre-v7 kernel (i.e., System V)
0 string :\ shell archive or commands for antique kernel text
@@@End of magdir/commands
# Unpack magdir/compress: entries for packed, huf and compressed data.
# The ">2 byte" sub-entries under "compressed data" append a bit-width
# message (12..16 for non-block, 140..144 for block compressed).
echo x - magdir/compress 1>&2
cat >magdir/compress <<'@@@End of magdir/compress'
0 short 017037 packed data
# CPL - added pack to /etc/magic
0 short 017436 packed data
0 short 0145405 huf output
0 string \037\235 compressed data
# non block compressed
>2 byte 12 - with 12 bits
>2 byte 13 - with 13 bits
>2 byte 14 - with 14 bits
>2 byte 15 - with 15 bits
>2 byte 16 - with 16 bits
# block compressed
>2 byte 140 - with 12 bits
>2 byte 141 - with 13 bits
>2 byte 142 - with 14 bits
>2 byte 143 - with 15 bits
>2 byte 144 - with 16 bits
@@@End of magdir/compress
# Unpack magdir/convex: a single entry labelling Convex executables
# (long value 0513 at offset 0). Body is written verbatim.
echo x - magdir/convex 1>&2
cat >magdir/convex <<'@@@End of magdir/convex'
0 long 0513 Convex executable
@@@End of magdir/convex
# Unpack magdir/diff: string entries at offset 0 that label diff(1) output.
# "\ " in a pattern is a literal space inside the match string; the quoted
# heredoc delimiter keeps those backslashes intact.
echo x - magdir/diff 1>&2
cat >magdir/diff <<'@@@End of magdir/diff'
#
# magic file lines for output from "diff"...
0 string diff\ 'diff' output text
0 string ***\ 'diff' output text
0 string Only\ in\ 'diff' output text
0 string Common\ subdirectories:\ 'diff' output text
@@@End of magdir/diff
# Unpack magdir/ditroff: entries matching the "x T <device>" header line.
# The two device-specific patterns come before the generic "x T" one.
echo x - magdir/ditroff 1>&2
cat >magdir/ditroff <<'@@@End of magdir/ditroff'
# Magic numbers for ditroff intermediate language
0 string x\ T\ cat titroff output for the C/A/T text
0 string x\ T\ ps titroff output for PostScript
0 string x\ T titroff output text
@@@End of magdir/ditroff
# Unpack magdir/fonts: entries for ASCII vfont text and Berkeley vfont data,
# the latter in both native (0436) and byte-swapped (017001) form.
echo x - magdir/fonts 1>&2
cat >magdir/fonts <<'@@@End of magdir/fonts'
0 string FONT ASCII vfont text
0 short 0436 Berkeley vfont data
0 short 017001 byte-swapped Berkeley vfont data
@@@End of magdir/fonts
# Unpack magdir/frame: one entry labelling FrameMaker documents by the
# leading string "<MakerFile" (the "<" is backslash-escaped in the pattern).
echo x - magdir/frame 1>&2
cat >magdir/frame <<'@@@End of magdir/frame'
# Magic number for FrameMaker files
# Thanks to Berry Kercheval
#
0 string \<MakerFile FrameMaker document
@@@End of magdir/frame
# Unpack magdir/imagen: entries for IMAGEN printer-ready files and RST
# raster fonts. Under "@document(" the ">10" sub-entries name the language;
# under "Rast" the ">45" sub-entry prints the string found at offset 45,
# so its message needs a complete %s conversion.
# The quoted heredoc delimiter writes the body verbatim (no shell expansion).
echo x - magdir/imagen 1>&2
cat >magdir/imagen <<'@@@End of magdir/imagen'
# Tell file about magic for IMAGEN printer-ready files:
0 string @document( Imagen printer
# this only works if "language xxx" is first item in Imagen header.
>10 string language\ impress (imPRESS data)
>10 string language\ daisy (daisywheel text)
>10 string language\ diablo (daisywheel text)
>10 string language\ printer (line printer emulation)
>10 string language\ tektronix (Tektronix 4014 emulation)
# Add any other languages that your Imagen uses - remember
# to keep the word `text' if the file is human-readable.
#
# Now magic for IMAGEN font files...
0 string Rast RST-format raster font data
>45 string >0 face %s
@@@End of magdir/imagen
# Unpack magdir/intel: entries for 80286/80386 and DOS executables.
# ">" lines are sub-entries of the preceding offset-0 entry and add
# model / "not stripped" / version details to its message.
# NOTE(review): "%ld" is used here with "short" fields; confirm that matches
# how the matcher hands values to printf.
echo x - magdir/intel 1>&2
cat >magdir/intel <<'@@@End of magdir/intel'
# various intel-CPU magic numbers
0 short 01006 80286 executable (STL)
>31 byte <0x040 small model
>31 byte =0x048 large model
>31 byte =0x049 huge model
>16 long >0 not stripped
0 string MZ DOS executable (EXE)
0 string LZ DOS executable (built-in)
0 byte 0xe9 DOS executable (COM)
0 byte 0xeb DOS executable (COM)
0 short =0512 80286 executable small model (COFF)
>12 long >0 not stripped
>22 short >0 - version %ld
0 short =0522 80286 executable large model (COFF)
>12 long >0 not stripped
>22 short >0 - version %ld
0 short =0514 80386 executable
>12 long >0 not stripped
>22 short >0 - version %ld
@@@End of magdir/intel
# Unpack magdir/magic: one entry that labels a file beginning with "#magic"
# as a magic file for file(1) itself.
echo x - magdir/magic 1>&2
cat >magdir/magic <<'@@@End of magdir/magic'
0 string #magic magic text file for file(1) cmd
@@@End of magdir/magic
# Unpack magdir/mail.news: string entries at offset 0 that label mail and
# netnews text by header keyword. The plain "From" entry is deliberately
# left commented out (see the note inside the body).
echo x - magdir/mail.news 1>&2
cat >magdir/mail.news <<'@@@End of magdir/mail.news'
# Unfortunately, saved netnews also has From line added in some news software.
#0 string From mail text
# There are tests to ascmagic.c to cope with mail and news.
0 string Relay-Version: old news text
0 string #!\ rnews batched news text
0 string N#!\ rnews mailed, batched news text
0 string Forward\ to mail forwarding text
0 string Pipe\ to mail piping text
0 string Return-Path: smtp mail text
0 string Path: news text
0 string Xref: news text
0 string From: news or mail text
0 string Article saved news text
@@@End of magdir/mail.news
# Unpack magdir/mirage: one entry (long 31415 at offset 0) labelling
# Mirage Assembler m.out executables.
echo x - magdir/mirage 1>&2
cat >magdir/mirage <<'@@@End of magdir/mirage'
0 long 31415 Mirage Assembler m.out executable
@@@End of magdir/mirage
# Unpack magdir/misc: one entry labelling files that start with "begin"
# as uuencoded mail text.
echo x - magdir/misc 1>&2
cat >magdir/misc <<'@@@End of magdir/misc'
0 string begin uuencoded mail text
@@@End of magdir/misc
# Unpack magdir/misc2: a single lex-output entry ("yyprevious" at offset 53).
# NOTE(review): the same entry is also written into magdir/c by this archive;
# presumably one copy is redundant once the fragments are combined - confirm.
echo x - magdir/misc2 1>&2
cat >magdir/misc2 <<'@@@End of magdir/misc2'
# derived empirically, your offsets may vary!
53 string yyprevious c program text (from lex)
@@@End of magdir/misc2
# Unpack magdir/olda.out: entries for old a.out style executables, keyed on
# the values 0401..0437 read at offset 0 as either a long or a short.
# The "- version" sub-entries are shipped commented out.
echo x - magdir/olda.out 1>&2
cat >magdir/olda.out <<'@@@End of magdir/olda.out'
0 long 0407 executable
>16 long >0 not stripped
#>2 short >0 - version %ld
0 short 0407 PDP-11 executable
>8 short >0 not stripped
0 short 0401 unix-rt ldp
0 short 0405 old overlay
0 long 0410 pure executable
>16 long >0 not stripped
#>2 short >0 - version %ld
0 short 0410 PDP-11 pure executable
>8 short >0 not stripped
#>2 short >0 - version %ld
0 short 0411 PDP-11 separate I&D executable
>8 short >0 not stripped
#>2 short >0 - version %ld
0 long 0413 demand paged pure executable
>16 long >0 not stripped
#>2 short >0 - version %ld
0 long 0420 demand paged (first page unmapped) pure executable
>16 long >0 not stripped
#>2 short >0 - version %ld
0 short 0437 pdp11 kernel overlay
@@@End of magdir/olda.out
# Unpack magdir/postscript: labels files starting with "%!" as PostScript
# text; sub-entries add "conforming" for a "PS-Adobe-" header and print the
# level string when "1.0" is found at offset 11.
echo x - magdir/postscript 1>&2
cat >magdir/postscript <<'@@@End of magdir/postscript'
#
# Let us not forget PostScript
0 string %! PostScript text
>2 string PS-Adobe- conforming
>11 string 1.0 at level %s
@@@End of magdir/postscript
# Unpack magdir/rasterfile: matches the 4-byte Sun rasterfile signature and
# prints three long fields (offsets 4, 8, 12) followed by format and
# colour-map notes taken from offsets 20 and 24.
echo x - magdir/rasterfile 1>&2
cat >magdir/rasterfile <<'@@@End of magdir/rasterfile'
# Sun rasterfiles
0 string \x59\xa6\x6a\x95 rasterfile
>4 long >0 %d
>8 long >0 x %d
>12 long >0 x %d
>20 long 0 old format
>20 long 2 compressed
>24 long 1 with color map
@@@End of magdir/rasterfile
# Unpack magdir/sccs: one entry that recognises SCCS archives by the
# "\001s " record at offset 8 instead of the leading "\001h" checksum line;
# the comments inside the body explain why.
echo x - magdir/sccs 1>&2
cat >magdir/sccs <<'@@@End of magdir/sccs'
# SCCS archive structure:
# \001h01207
# \001s 00276/00000/00000
# \001d D 1.1 87/09/23 08:09:20 ian 1 0
# \001c date and time created 87/09/23 08:09:20 by ian
# \001e
# \001u
# \001U
# ... etc.
# Now '\001h' happens to be the same as the 3B20's a.out magic number (0550).
# *Sigh*. And these both came from various parts of the USG.
# Maybe we should just switch everybody from SCCS to RCS!
# Further, you can't just say '\001h0', because the five-digit number
# is a checksum that could (presumably) have any leading digit,
# and we don't have regular expression matching yet.
# Hence the following official kludge:
8 string \001s\ SCCS archive.
@@@End of magdir/sccs
# Unpack magdir/sequent: entries for Sequent BALANCE NS32000 object and
# executable files, each executable with a "not stripped" sub-entry at
# offset 16.
echo x - magdir/sequent 1>&2
cat >magdir/sequent <<'@@@End of magdir/sequent'
# For Sequent's multiprocessor systems (incomplete).
0 long 000352 BALANCE NS32000 .o
0 long 010352 BALANCE NS32000 executable (0 @ 0)
>16 long >0 not stripped
0 long 020352 BALANCE NS32000 executable (invalid @ 0)
>16 long >0 not stripped
0 long 030352 BALANCE NS32000 standalone executable
>16 long >0 not stripped
# Also need info on Sequent "Symmetry" series...
@@@End of magdir/sequent
# Unpack magdir/softquad: one entry for SoftQuad DESC/font binaries
# (short 0125252 at offset 0) with a version sub-entry read from offset 2.
echo x - magdir/softquad 1>&2
cat >magdir/softquad <<'@@@End of magdir/softquad'
# SoftQuad troff magic numbers
# SoftQuad @(#)magic 1.2 86/09/15
0 short 0125252 SoftQuad DESC or font file binary
>2 short >0 - version %d
@@@End of magdir/softquad
# Unpack magdir/sun: entries for Sun MC680x0 binaries. The short at offset 0
# selects the machine label (2, 1, 0); the short at offset 2 selects the
# executable kind. Core files and byte-swapped executables follow.
echo x - magdir/sun 1>&2
cat >magdir/sun <<'@@@End of magdir/sun'
# Values for Sun MC680x0 binaries
0 short 2 mc68020
>2 short 0407 executable
>2 short 0410 pure executable
>2 short 0413 demand paged executable
>16 long >0 not stripped
0 short 1 mc68010
>2 short 0407 executable
>2 short 0410 pure executable
>2 short 0413 demand paged executable
>16 long >0 not stripped
0 short 0 old sun-2
>2 short 0407 executable
>2 short 0410 pure executable
>2 short 0413 demand paged executable
>16 long >0 not stripped
0 long 0x080456 core file
>128 string >0 from '%s'
#
0 short 05401 byte-swapped demand paged executable
0 short 010001 byte-swapped demand paged executable
@@@End of magdir/sun
# Unpack magdir/tower: entries for NCR Tower object files. Each offset-0
# short selects a Tower model label; the sub-entries add executable kind,
# "not stripped" and version. The two Tower32/800 entries also test flag
# bits in the short at offset 18 using "&" masks.
echo x - magdir/tower 1>&2
cat >magdir/tower <<'@@@End of magdir/tower'
# NCR Tower objects, contributed by
# Michael R. Wayne *** TMC & Associates *** INTERNET: wayne at ford-vax.arpa
# uucp: {philabs | pyramid} !fmsrl7!wayne OR wayne at fmsrl7.UUCP
#
0 short 000610 Tower/XP rel 2 object
>12 long >0 not stripped
>20 short 0407 executable
>20 short 0410 pure executable
>22 short >0 -version %ld
0 short 000615 Tower/XP rel 2 object
>12 long >0 not stripped
>20 short 0407 executable
>20 short 0410 pure executable
>22 short >0 -version %ld
0 short 000620 Tower/XP rel 3 object
>12 long >0 not stripped
>20 short 0407 executable
>20 short 0410 pure executable
>22 short >0 -version %ld
0 short 000625 Tower/XP rel 3 object
>12 long >0 not stripped
>20 short 0407 executable
>20 short 0410 pure executable
>22 short >0 -version %ld
0 short 000630 Tower32/600/400 68020 object
>12 long >0 not stripped
>20 short 0407 executable
>20 short 0410 pure executable
>22 short >0 -version %ld
0 short 000640 Tower32/800 68020
>18 short &020000 w/68881 object
>18 short &040000 compatible object
>18 short &~060000 object
>20 short 0407 executable
>20 short 0413 pure executable
>12 long >0 not stripped
>22 short >0 -version %ld
0 short 000645 Tower32/800 68010
>18 short &040000 compatible object
>18 short &~060000 object
>20 short 0407 executable
>20 short 0413 pure executable
>12 long >0 not stripped
>22 short >0 -version %ld
@@@End of magdir/tower
# Unpack magdir/typeset: two entries, one for very old C/A/T troff output
# (bytes \100\357) and one for Xerox InterPress data.
echo x - magdir/typeset 1>&2
cat >magdir/typeset <<'@@@End of magdir/typeset'
# other typesetting magic
0 string \100\357 very old (C/A/T) troff output data
0 string Interpress/Xerox Xerox InterPress data
@@@End of magdir/typeset
# Unpack magdir/varied.out: entries for USG object file formats, keyed on the
# short at offset 0, most with a "not stripped" sub-entry at offset 12.
# The version sub-entries are shipped commented out.
echo x - magdir/varied.out 1>&2
cat >magdir/varied.out <<'@@@End of magdir/varied.out'
# Herewith many of the object file formats used by USG systems.
# The `versions' should be un-commented if they work for you.
0 short 0570 SysV executable
>12 long >0 not stripped
#>22 short >0 - version %ld
0 short 0575 SysV pure executable
>12 long >0 not stripped
#>22 short >0 - version %ld
0 short 0502 basic-16 executable
>12 long >0 not stripped
0 short 0503 basic-16 executable (TV)
>12 long >0 not stripped
0 short 0510 x86 executable
>12 long >0 not stripped
0 short 0511 x86 executable (TV)
>12 long >0 not stripped
0 short 0550 3b20 executable
>12 long >0 not stripped
0 short 0551 3b20 executable (TV)
>12 long >0 not stripped
0 short 0560 WE32000 executable
>12 long >0 not stripped
0 short 0561 WE32000 executable (TV)
>12 long >0 not stripped
0 short 0610 Perkin-Elmer executable
@@@End of magdir/varied.out
# Unpack magdir/vax.byteswap: entries that let a Sun recognise VAX and
# PDP-11 executables by their byte-swapped long at offset 0 (see the
# contributor's note inside the body).
echo x - magdir/vax.byteswap 1>&2
cat >magdir/vax.byteswap <<'@@@End of magdir/vax.byteswap'
# Byte-swapped VAXen
# From: dupuy at amsterdam.columbia.edu (Alexander Dupuy)
#
# Here are a few lines you can add to /etc/magic on your sun workstations in
# order to recognize VAX executables and objects.... you could do something
# similar (in reverse) for your vaxen, but since 4.3+NFS' file(1) doesn't look
# for /etc/magic, I've never bothered. It really should be built in to file(1)
# so you would see the state of setuid/setgid/sticky bits. Or actually, there
# should be support for checking that sort of thing in /etc/magic.
#
0 long 00700200000 VAX executable
>16 long &0x7fffffff not stripped
0 long 01000200000 VAX pure executable
>16 long &0x7fffffff not stripped
0 long 01300200000 VAX demand-paged pure executable
>16 long &0x7fffffff not stripped
0 long 01100200000 PDP-11 executable
@@@End of magdir/vax.byteswap
# Unpack magdir/xenix: entries for XENIX x.out executables and relocatables.
# The header comment and the first entry sit on separate lines so that the
# 01006 entry is active and the ">8" / ">16" sub-entries have a parent.
# The quoted heredoc delimiter writes the body verbatim (no shell expansion).
echo x - magdir/xenix 1>&2
cat >magdir/xenix <<'@@@End of magdir/xenix'
# XENIX executable formats: derived empirically; treat as folklore until proven
0 short 01006 XENIX (x.out) executable
>8 short 1 Middle model
>16 short >0 not stripped
0 short 02600 XENIX 8086 relocatable or 80286 small model
@@@End of magdir/xenix
# Unpack tst/Makefile: rules that fabricate small sample files whose leading
# bytes match some of the magic entries shipped in this archive.
# The heredoc delimiter is quoted, so "$@" reaches make unexpanded.
# Every command line inside a rule begins with a TAB, as make requires.
echo x - tst/Makefile 1>&2
cat >tst/Makefile <<'@@@End of tst/Makefile'
# Make up some fake test files that are easily produced.
# By no means an exhaustive test!
# @(#) $Header: Makefile,v 1.4 87/11/07 12:46:09 ian Exp $

all: ar cmd emp i t x

# Fake archives: old-style ar, ASCII cpio, ranlib'ed ar, Software Tools.
# NOTE(review): "echo - -h-" presumably relies on an echo that treats a lone
# "-" as end-of-options so that the file starts with "-h-"; confirm locally.
ar:
	echo '<ar> fake fake fake' >$@
	echo 070707 fake fake fake >$@.asc
	echo '!<arch>.__.SYMDEF fake fake fake' >$@.ranlib
	echo - -h- >$@.swt

# Fake command scripts, one per "#!" spelling plus the ": " form.
cmd:
	echo '#! /bin/sh' >$@
	echo '#!/bin/sh' >c.sh2
	echo '#! /bin/csh' >c.csh1
	echo '#!/bin/csh' >c.csh2
	echo '#! /bin/awk' >c.awk1
	echo '#!/bin/awk' >c.awk2
	echo '#! /' >c.misc1
	echo '#!/' >c.misc2
	echo ': ' >c.broken

# Empty file.
emp:
	touch $@

# Fake Imagen printer files.
i:
	echo '@document(language impress)fake fake' >$@
	echo '@document(language diablo)fake fake' >$@.d

# tar archive of whatever is in the directory at that point.
t:
	rm -f $@
	tar cvf $@ *

# Fake InterPress file.
x:
	echo 'Interpress/Xerox fake fake fake' >$@

# Removes every file whose name starts with a lower-case letter.
clean:
	rm -f [a-z]*
@@@End of tst/Makefile
exit 0
--
For comp.sources.unix stuff, mail to sources at uunet.uu.net.
More information about the Comp.sources.unix
mailing list