[tex] new version of the ascii-german -> diacritics conversion program
Gustaf Neumann
NEUMANN at awiwuw11.wu-wien.ac.at
Mon Nov 19 14:25:12 AEST 1990
Archive-name: diac/19-Nov-90
Original-posting-by: NEUMANN at awiwuw11.wu-wien.ac.at (Gustaf Neumann)
Original-subject: new version of the ascii-german -> diacritics conversion program
Reposted-by: emv at ox.com (Edward Vielmetti)
[Reposted from comp.text.tex.
Comments on this service to emv at ox.com (Edward Vielmetti).]
Below you will find a new version of the ascii-german to diacritical
german conversion program. It is still not perfect, but I think it is
pretty good by now. I was able to translate all German words in the book
\bibitem[{Neu88}]{neumann88}
G.~Neumann: \T{Metaprogrammierung und Prolog},
Addison--Wesley, Bonn 1988.
correctly from diacritics -> ascii german -> diacritics.
There are several known problems such as "Masse" ("im hohen Masse" vs.
"Gesteinsmasse") and "Busse" ("Autobusse" vs. "tuet Busse"). In both
cases the first variant is assumed to be correct. You can achieve the
other alternatives by writing "Gesteinsmas{}se" and "tuet Bu{}sse"
resp. Reports of other mis-translated words are welcome.
-Gustaf Neumann
-------------------------------------------------------------------
Gustaf Neumann neumann at dec4.wu-wien.ac.at, neumann at awiwuw11.bitnet
Vienna University of Economics and Business Administration
Augasse 2-6, A-1090 Vienna, Austria
Tel: +43 (222) 31-336 x4533 Fax 347-555
------------------------------------- cut here -----diac.shar-----------
# This is a shell archive. Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by neumann on Sun Nov 18 23:20:06 1990
#
# This archive contains:
# diac.l Makefile diacaux.c diacaux.h
#
LANG=""; export LANG
echo x - diac.l
cat >diac.l <<'@EOF'
%{
/* diac.l
* lex file for converting Ascii German into diacritical German
* Version 1.0 written by
* Dorai Sitaram, Rice University, 1990 dorai at titan.rice.edu
*
* Version 1.1:
* General rewrite, using some Material from
* H.Kaeslin, Behandlung der Umlaute bei der Verarbeitung deutscher
* Texte unter Unix, in: it, Vol 1, 1988
* and Duden - die Rechtschreibung.
*
* Gustaf Neumann, Wirtschaftsuniversitaet Wien, October 1990
* neumann at dec4.wu-wien.ac.at neumann at awiwuw11.bitnet
*
* The resulting LaTeX file uses german.sty!
* Representation of umlaut characters: \"a \"A \"o \"O \"u \"U {\ss}
* The style file german.sty would allow "a "A "o "O "u "U "s
* as well, but the latter representation makes it impossible
* to distinguish between umlaut characters and quoted text. This distinction
* is necessary in cases where quotes should be changed into opening and
* closing german quotes (\glqq and \grqq) in an automated way (another
* lex program).
*
* If you do NOT want to use GERMAN.STY, replace underneath the ruleset
* for \documentstyle with the following rule:
\\documentstyle[^\}]*\} { printf("%s\n", yytext);
printf("\\newskip\\zeeskip\n");
printf("\\zeeskip=0pt plus0pt minus0pt\n");
printf("\\def\\1{\\nobreak\\hskip\\zeeskip}\n");
printf("\\let\\umlaut\\\"\n");
printf("\\def\\\"#1{\\1\\umlaut#1\\1}\n");
printf("\\let\\oldss\\ss\n");
printf("\\def\\ss{\\1\\oldss\\1}\n"); }
*
*
* To prevent the conversion from Ascii German into diacritical German,
* it is necessary to insert empty groups into the words (e.g. Ka{}eslin).
*/
#include "diacaux.h"
int i; /* scratch index, used by the \documentstyle[...] rule */
/*
 * The declarations that follow the closing %} are:
 *   %p %n %e %a %k %o  sizes of lex's internal tables (positions, states,
 *                      parse tree nodes, transitions, packed character
 *                      classes, output array)
 *   V  a vowel, either case
 *   C  a letter that is not a vowel, either case
 *   W  blank, double or single quote, tab, newline or one of , ; ! ? ( ) .
 *      -- used in the rules as the character next to a word
 *   b  blank, tab or newline
 */
%}
%p 6500
%n 1000
%e 2500
%a 4000
%k 2500
%o 3500
V [AEIOUaeiou]
C [B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]
W [ "'\t\n,;!?().]
b [ \t\n]
%%
\\documentstyle{b}*\{ printf("\\documentstyle[german]{"); /* no option list: emit one holding just german */
\\documentstyle{b}*\[.*german.*\]{b}*\{ ECHO; /* german already appears in the option list: copy unchanged */
\\documentstyle{b}*\[.*\]{b}*\{ {
	/* Option list without german: re-emit the command with german
	 * prepended to the existing options.  "\documentstyle" fills
	 * yytext[0..13]; the pattern allows blanks before the '[', so
	 * scan forward to it (the pattern guarantees that it exists)
	 * and print everything that follows it. */
	for(i=14;yytext[i]!='[';i++);
	printf("\\documentstyle[german,%s",&yytext[i+1]);}
\\input{b}*\{[^\}]+\} { texfile = getfilenamebrack(&yytext[6]); /* braced form; yytext[6] is the first character after "\input" */
tempfile = maketempfilename(texfile);
printf("\\input{%s}", tempfile); /* the output refers to the temporary file instead of the original */
dosubdiac(texfile, tempfile); } /* run diac on the included file, writing the temporary file */
\\input{b}*[^ \t\n]+ { texfile = getfilename(&yytext[6]); /* unbraced form: name runs up to the next blank, tab or newline */
tempfile = maketempfilename(texfile);
printf("\\input %s", tempfile);
dosubdiac(texfile, tempfile); } /* NOTE(review): when text follows the '}' of a braced \input without a blank, this rule gives the longer match and wins -- confirm */
\\begin\{.+\} ECHO; /* copied unchanged; .+ extends to the last '}' on the line */
\\end\{.+\} ECHO; /* same for \end */
\\[A-Za-z]+ ECHO; /* any other backslash + letters sequence is copied unchanged */
%{ /* ue
 * Exceptions to the default rule [AaOoUu]e at the end of the rule set.
 * Each pattern here is longer than that two-character rule, so lex
 * prefers it.  ECHO copies the matched text unchanged, which keeps its
 * "ue" as two letters; the two printf rules convert only part of the
 * match.
 */
%}
[Rr]euessier printf("%ce\\\"ussier", yytext[0]); /* "eue": only the "ue" becomes an umlaut */
[^igGbB][Ee]ue ECHO;
[QqAa]ue ECHO;
[Uu]e[iu] ECHO;
[Gg]etue{W} ECHO;
[a-rt-z]tuend ECHO;
{W}tuet{W} ECHO;
[Nn]ichtstuend ECHO;
[Nn]ichtstuer ECHO;
Tuerei{W} ECHO;
[a-z]tuerei ECHO;
[a-z]tuerisch ECHO;
[Aa]bzue[b-z][a-z]*[elr]n ECHO;
[Aa]nzue[b-z][a-z]*[elr]n ECHO;
[Aa]u[fs]zue[b-z][a-z]*[elr]n ECHO;
[Ee]inzue[b-z][a-z]*[elr]n ECHO;
[Hh]inzue[b-z][a-z]*[elr]n ECHO;
[Mm]itzue[b-z][a-z]*[elr]n ECHO;
[Nn]achzue[b-z][a-z]*[elr]n ECHO;
[Vv]orzue[b-z][a-z]*[elr]n ECHO;
[Ww]iederzue[b-z][a-z]*[elr]n ECHO;
[Zz]ue[b-z][a-z]*[elr]n ECHO;
[Zz]urueckzue[b-z][a-z]*[elr]n printf("%cur\\\"uckzu%s",yytext[0],&yytext[9]); /* first "ue" converted; text from yytext[9] (the 'e' after "zu") printed as is */
tuendere ECHO;
[Aa]biguen ECHO;
[Aa]ffluen ECHO;
[Bb]u[ea]nos ECHO;
[Dd]uett ECHO;
[Dd]uell ECHO;
entuell ECHO;
[Gg]raduell ECHO;
[Gg]uerill ECHO;
[Ii]ndividuen ECHO;
[Ii]nfluen ECHO;
Lueger ECHO;
[krx]tuell ECHO;
[Kk]ongruen ECHO;
[Kk]onstituen ECHO;
[Mm]enuett ECHO;
[Mm]anuell ECHO;
[Mm]igue[tl] ECHO;
[Pp]irouett ECHO;
[Pp]uerto ECHO;
[Rr]esiduen ECHO;
[Ss]tatue ECHO;
[Ss]exuell ECHO;
[Ss]uez ECHO;
[Vv]enezuel ECHO;
[Vv]isuell ECHO;
[Zz]uerkannt ECHO;
[Zz]uerteil ECHO;
[Zz]uerst ECHO;
%{ /* ae
 * Patterns whose "ae" is copied unchanged (ECHO) instead of being
 * turned into an umlaut by the default rule [AaOoUu]e at the end of
 * the rule set.  The last three rules of this group (Samuel, value,
 * true) contain "ue" rather than "ae" and are copied unchanged as well.
 */
%}
[Aa]ero ECHO;
[Dd]odekae ECHO;
[Hh]exae ECHO;
[Ii]kosae ECHO;
[Ii]srael ECHO;
[Kk]afkaesk ECHO;
aeuel printf("\\\"auel"); /* the "ae" becomes an umlaut, the following "ue" is kept */
[Mm]ichael ECHO;
[Mm]etae ECHO;
[Oo]ctae ECHO;
[Pp]entae ECHO;
[Pp]harmae ECHO;
[Rr]affael ECHO;
[Rr]afael ECHO;
[Rr]aphael ECHO;
[Tt]etrae ECHO;
[Tt]hemae ECHO;
[Ss]chemae ECHO;
[Ss]amuel ECHO;
[Vv]alue{W} ECHO;
[Tt]rue{W} ECHO;
%{ /* oe
 * Patterns that are copied unchanged (ECHO) so that an "oe" is not
 * turned into an umlaut by the default rule [AaOoUu]e at the end of
 * the rule set.  Several patterns end in 'o' and contain no "oe"
 * themselves: matching them consumes the 'o', so it cannot pair with
 * an 'e' that follows in the input.
 */
%}
[Aa]utoe ECHO;
[Bb]enzoe ECHO;
[Cc]hemoe ECHO;
[Dd]iarrhoea ECHO;
[Ee]lektroe ECHO;
[Gg]oethe ECHO;
[Hh]eroen ECHO;
[Hh]o[ml]oe ECHO;
[Hh]ydroe ECHO;
[Ii]ndoeuro ECHO;
Joel ECHO;
[Kk]inoe ECHO;
[Kk]oedukat ECHO;
[Kk]oeffizi ECHO;
[Kk]oerzi ECHO;
[Kk]oexist ECHO;
[Cc]oexist ECHO;
[Kk]oenzym ECHO;
[Kk]ontoe ECHO;
[Ss]oeben ECHO;
Soest ECHO;
[Mm]etazoe ECHO;
[Mm][ai][ck]roe ECHO;
[Mm]onoe ECHO;
[Nn]euroe ECHO;
[Oo]boe ECHO;
[Oo]erlikon ECHO;
[Oo]ldesloe ECHO;
[Oo]kto ECHO;
[Oo]pto ECHO;
[Pp]oesie ECHO;
[Pp]oebene ECHO;
[Pp]iezo ECHO;
[Pp]hoto ECHO;
[Pp]hysioe ECHO;
[Pp]oe[mt]i ECHO;
[Pp]oe[mt][^a-z] ECHO;
[Pp]orto ECHO;
[Pp]roenzy ECHO;
[Pp]roto ECHO;
[Pp]rotozoe ECHO;
[Pp]seudo ECHO;
[Pp]sycho ECHO;
[Pp]yro ECHO;
[Rr]adio ECHO;
[Tt]otoer ECHO;
[Tt]urbo ECHO;
[Vv]ideo ECHO;
%{ /* ss
 * Rules for "ss".  ECHO rules copy the match unchanged, so their "ss"
 * stays two letters; printf rules write {\ss} (and, where the pattern
 * also holds "ue" or "ae", the umlaut) at a fixed place in the match.
 * The last two rules of the file are the defaults that apply when no
 * longer pattern matched: [AaOoUu]e becomes \" followed by the vowel,
 * and any remaining "ss" becomes {\ss}.
 */
%}
{V}sss printf("%c{\\ss}s",yytext[0]); /* vowel + "sss": the first two s become {\ss}, the third stays */
[EeAu][iu]ss printf("%c%c{\\ss}", yytext[0],yytext[1]); /* NOTE(review): first class is [EeAu]; possibly [EeAa] was meant -- confirm */
{C}{V}sser{W} ECHO;
{C}{V}sser{V} ECHO;
{C}{V}ssen ECHO;
[^r]uesse[ln] printf("%c\\\"usse%c",yytext[0],yytext[6]); /* umlaut for "ue", "ss" kept; yytext[6] is the final l or n */
luesse printf("l\\\"usse");
iess printf("ie{\\ss}");
ssung ECHO;
ssel ECHO;
ssoren ECHO;
ssiez ECHO;
ccess ECHO;
ssidy ECHO;
chss ECHO;
ssch ECHO;
sspr ECHO;
ssier ECHO;
nisse ECHO;
lss ECHO;
ss' ECHO;
tionss ECHO;
tss ECHO;
ussisch ECHO;
ungss ECHO;
usserl{W} ECHO;
[Aa]ssoz ECHO;
[Aa]ssist ECHO;
[Aa]ssemb ECHO;
[Aa]uss[^e] ECHO;
[Aa]usse[^rn] ECHO;
[Aa]ussende ECHO;
[Ee]sse ECHO;
[Bb]isschen printf("%ci{\\ss}chen", yytext[0]);
[Bb]usiness ECHO;
[Bb]usse ECHO;
[Bb]ussard ECHO;
triebss ECHO;
beitss ECHO;
[Dd]iskussion ECHO;
[Dd]issert ECHO;
[Dd]asselb ECHO;
[Ee]ssi ECHO;
[Ff]lusse ECHO;
[Ff]luess[ie] printf("%cl\\\"uss%c", yytext[0],yytext[6]); /* umlaut for "ue", "ss" kept; yytext[6] is the final i or e */
Grass ECHO;
[Gg]enosse ECHO;
[Gg]rosse printf("%cro{\\ss}e",yytext[0]);
[Ii]nteress ECHO;
[Kk]lass[ie] ECHO;
[Kk]assette ECHO;
[Ll]asse ECHO;
[Ll]aessig printf("%c\\\"assig", yytext[0]); /* umlaut for "ae", "ss" kept */
[Mm]assa[^nr] ECHO;
[Mm]asseu ECHO;
[Mm]isser{C} printf("%ci{\\ss}er%c", yytext[0],yytext[6]); /* yytext[6] is the consonant matched by {C} */
[Mm]iss[ei] ECHO;
[Ee]rmassen printf("%crma{\\ss}en", yytext[0]);
[Mm]assi ECHO;
[Pp]rivatissi ECHO;
[Pp]assiv ECHO;
[Pp]rozessor ECHO;
[Ss]tossen printf("%cto{\\ss}en", yytext[0]);
[Rr]essource ECHO;
[Ww][ia]sse ECHO;
{C}ss{C} ECHO;
[AaOoUu]e printf("\\\"%c", yytext[0]); /* default: vowel + e becomes \" + vowel */
ss printf("{\\ss}"); /* default: ss becomes {\ss} */
@EOF
chmod 644 diac.l
echo x - Makefile
cat >Makefile <<'@EOF'
#
# if you do not have flex available, deactivate the definitions of
# LEX and LEXLIB; The program compiled with flex works also with the
# standard lex library (-ll).
#
LEX=flex
LEXLIB=-lfl
PROGS= diac
all: ${PROGS}
# Generate the scanner, then compile and link it with the helper functions.
diac: diac.l diacaux.h diacaux.c
	${LEX} ${LFLAGS} diac.l
	cc -O ${DEFINES} -o $@ diacaux.c lex.yy.c ${LEXLIB}
	strip $@
# -f: compilers that go straight from the sources to the executable leave
# no object files behind, and a failing rm would fail the whole target.
	rm -f lex.yy.c lex.yy.o diacaux.o
clean:
# The '#' is escaped; unescaped it starts a comment and "#* core" is
# never passed to rm.
	rm -f ${PROGS} *.o *~ \#* core
shar:
	shar diac.l Makefile diacaux.c diacaux.h > diac.shar
@EOF
chmod 644 Makefile
echo x - diacaux.c
cat >diacaux.c <<'@EOF'
/* diacaux.c
* to be linked with lex.yy.c from diac.l
* written by Dorai Sitaram, Rice University, 1990
*/
#include "diacaux.h"
#include <stdlib.h>
#include <string.h>
/*
 * slen - number of characters in the NUL-terminated string s, not
 * counting the terminator.
 *
 * The scan starts at index 0 so that the empty string yields 0; a scan
 * starting at index 1 would step over the terminator of "" and read
 * memory beyond the string.
 */
int slen(s)
char *s;
{
    int i;

    for (i = 0; s[i] != '\0'; i++)
        ;
    return i;
}
/*
 * strap - "string append": return a newly allocated string that holds
 * s followed by t.
 *
 * The result comes from malloc(); the caller releases it with free().
 * Returns NULL when either argument is NULL or when no memory is
 * available, so that a failed inner call in a nested strap(...) is
 * passed on instead of being dereferenced.
 */
char *strap(s,t)
char *s,*t;
{
    size_t slength, tlength;
    char *r;

    if (s == NULL || t == NULL)
        return NULL;
    slength = strlen(s);
    tlength = strlen(t);
    /* +1: neither length counts the terminating NUL */
    r = (char *) malloc(slength + tlength + 1);
    if (r == NULL)
        return NULL;
    memcpy(r, s, slength);
    memcpy(r + slength, t, tlength + 1);    /* brings t's NUL along */
    return r;
}
/*
 * getfilename - return a newly allocated copy of s without its leading
 * blanks, tabs and newlines.
 *
 * The result comes from malloc(); the caller releases it with free().
 * Returns NULL when s is NULL or when no memory is available.
 */
char *getfilename(s)
char *s;
{
    size_t len;
    char *r;

    if (s == NULL)
        return NULL;
    while (*s == ' ' || *s == '\t' || *s == '\n')
        s++;
    len = strlen(s);
    /* +1 for the terminator: without leading white space the copy is
     * exactly as long as the input */
    r = (char *) malloc(len + 1);
    if (r == NULL)
        return NULL;
    memcpy(r, s, len + 1);
    return r;
}
/*
 * getfilenamebrack - extract the file name from a braced argument such
 * as " {name}".
 *
 * Leading blanks, tabs, newlines and '{' characters are skipped; the
 * name is everything from there up to, but not including, the next '}'.
 * If there is no '}', the name runs to the end of the string instead of
 * the scan continuing past the terminator.
 *
 * The result comes from malloc(); the caller releases it with free().
 * Returns NULL when s is NULL or when no memory is available.
 */
char *getfilenamebrack(s)
char *s;
{
    size_t len;
    char *r;

    if (s == NULL)
        return NULL;
    while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '{')
        s++;
    len = strcspn(s, "}");      /* stops at '}' or at the terminator */
    r = (char *) malloc(len + 1);
    if (r == NULL)
        return NULL;
    memcpy(r, s, len);
    r[len] = '\0';
    return r;
}
/*
 * maketempfilename - build the name of the temporary file that receives
 * the converted copy of s: "/tmp/" followed by s with every '/'
 * replaced by '_'.
 *
 * The name is assembled in a single buffer that has room for the
 * prefix, the name and the terminating NUL, so no intermediate copy is
 * allocated or left behind.
 *
 * The result comes from malloc(); the caller releases it with free().
 * Returns NULL when s is NULL or when no memory is available.
 *
 * NOTE(review): the name is predictable and lives in a shared
 * directory; anyone able to create files there can get in the way.
 */
char *maketempfilename(s)
char *s;
{
    static char prefix[] = "/tmp/";
    size_t plen = sizeof prefix - 1;
    size_t len, k;
    char *r;

    if (s == NULL)
        return NULL;
    len = strlen(s);
    r = (char *) malloc(plen + len + 1);
    if (r == NULL)
        return NULL;
    memcpy(r, prefix, plen);
    for (k = 0; k < len; k++)
        r[plen + k] = (s[k] == '/') ? '_' : s[k];
    r[plen + len] = '\0';
    return r;
}
/*
 * Copy arg to dst as one single-quoted shell word and return the
 * position after the closing quote.  A quote inside arg is written as
 * '\'' (close the quoting, escaped quote, reopen), so dst needs room
 * for up to 4 characters per character of arg plus 2.
 */
static char *quoteword(dst, arg)
char *dst, *arg;
{
    *dst++ = '\'';
    for (; *arg != '\0'; arg++) {
        if (*arg == '\'') {
            memcpy(dst, "'\\''", 4);
            dst += 4;
        } else {
            *dst++ = *arg;
        }
    }
    *dst++ = '\'';
    return dst;
}

/*
 * dosubdiac - convert an included file by running a second diac
 * through the shell, with input file s and output file t.
 *
 * s  name of the file to read
 * t  name of the file to write
 *
 * The names used are the arguments s and t, not the globals texfile and
 * tempfile.  Both names come from the document being processed, so
 * each is passed as a single-quoted word: blanks and shell
 * metacharacters in a name are not interpreted by the shell.  The
 * command buffer is released after the call.
 *
 * Nothing is run when an argument is NULL or when no memory is
 * available.  The exit status of the command is not examined.
 */
void dosubdiac(s,t)
char *s,*t;
{
    static char run[] = "diac <";
    static char into[] = " > ";
    size_t need;
    char *cmd, *p;

    if (s == NULL || t == NULL)
        return;
    need = (sizeof run - 1) + (sizeof into - 1)
         + 4 * (strlen(s) + strlen(t))  /* worst case: every char a quote */
         + 4                            /* the two pairs of quotes */
         + 1;                           /* terminating NUL */
    cmd = (char *) malloc(need);
    if (cmd == NULL)
        return;
    p = cmd;
    memcpy(p, run, sizeof run - 1);
    p += sizeof run - 1;
    p = quoteword(p, s);
    memcpy(p, into, sizeof into - 1);
    p += sizeof into - 1;
    p = quoteword(p, t);
    *p = '\0';
    system(cmd);
    free(cmd);
}
@EOF
chmod 644 diacaux.c
echo x - diacaux.h
cat >diacaux.h <<'@EOF'
/* diac.h
* to be included in diac.l and diac.c
* written by Dorai Sitaram, Rice University, 1990
*/
char *texfile;
char *tempfile;
int slen();
char *strap();
char *getfilename();
char *getfilenamebrack();
char *maketempfilename();
void dosubdiac();
@EOF
chmod 644 diacaux.h
exit 0
More information about the Alt.sources
mailing list