English to Phoneme translation
John A. Wasser
wasser_1 at viking.DEC
Thu Mar 28 07:38:50 AEST 1985
Here are the programs you need to translate English text to
phonemes. The translation rules are from a government study
and are probably in the public domain. The rest of the code
was written by me and is now in the public domain. Have fun!
-John A. Wasser
p.s. If you don't like the way it works, you have my permission to
fix it without complaining to me first.
Work address:
ARPAnet: WASSER%VIKING.DEC at decwrl.ARPA
Usenet: {allegra,Shasta,decvax}!decwrl!dec-rhea!dec-viking!wasser
USPS: Digital Equipment Corp.
Mail stop: LJO2/E4
30 Porter Rd
Littleton, MA 01460
-------------------------------------------------------------------------------
Programmers information:
PARSE.C is the main program.
PARSE [infile [outfile]]
It breaks the input file up into words and translates them
individually. If a word is made of digits, the digits
are said individually and a terminating "." is treated as a
decimal point. Words containing both letters and digits
will produce unexpected results. This program is a small
example and can easily be improved upon.
ENGLISH.C contains the English to Phoneme rules.
PHONEME.C contains the translation procedure for each word.
-------------------------------------------------------------------------------
ENGLISH.C
-------------------------------------------------------------------------------
/*
** English to Phoneme rules.
**
** Derived from:
**
** AUTOMATIC TRANSLATION OF ENGLISH TEXT TO PHONETICS
** BY MEANS OF LETTER-TO-SOUND RULES
**
** NRL Report 7948
**
** January 21st, 1976
** Naval Research Laboratory, Washington, D.C.
**
**
** Published by the National Technical Information Service as
** document "AD/A021 929".
**
**
**
** The Phoneme codes:
**
** IY bEEt IH bIt
** EY gAte EH gEt
** AE fAt AA fAther
** AO lAWn OW lOne
** UH fUll UW fOOl
** ER mURdER AX About
** AH bUt AY hIde
** AW hOW OY tOY
**
** p Pack b Back
** t Time d Dime
** k Coat g Goat
** f Fault v Vault
** TH eTHer DH eiTHer
** s Sue z Zoo
** SH leaSH ZH leiSure
** HH How m suM
** n suN NG suNG
** l Laugh w Wear
** y Young r Rate
** CH CHar j Jar
** WH WHere
**
**
** Rules are made up of four parts:
**
** The left context.
** The text to match.
** The right context.
** The phonemes to substitute for the matched text.
**
** Procedure:
**
** Separate each block of letters (apostrophes included)
** and add a space on each side. For each unmatched
** letter in the word, look through the rules where the
** text to match starts with the letter in the word. If
** the text to match is found and the right and left
** context patterns also match, output the phonemes for
** that rule and skip to the next unmatched letter.
**
**
** Special Context Symbols:
**
** # One or more vowels
** : Zero or more consonants
** ^ One consonant.
** . One of B, D, V, G, J, L, M, N, R, W or Z (voiced
** consonants)
** % One of ER, E, ES, ED, ING, ELY (a suffix)
** (Found in right context only)
** + One of E, I or Y (a "front" vowel)
**
*/
/* Context definitions */
/*
** Shared strings used as fields of the rule tables below.
** An empty context string means "no requirement"; a single blank
** stands for the blank that surrounds every word handed to the matcher.
*/
static char Anything[] = ""; /* No context requirement */
static char Nothing[] = " "; /* Context is beginning or end of word */
/* Phoneme definitions */
static char Pause[] = " "; /* Short silence */
static char Silent[] = ""; /* No phonemes */
/* Indices of the four fields inside a Rule */
#define LEFT_PART 0
#define MATCH_PART 1
#define RIGHT_PART 2
#define OUT_PART 3
typedef char *Rule[4]; /* Rule is an array of 4 character pointers */
/*0 = Punctuation */
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules for blanks, apostrophes and punctuation (entry 0 of Rules[]).
** Entries are tried from top to bottom and the first one that fits is
** used, so the context-specific entries must precede the general ones.
*/
static Rule punct_rules[] =
{
{Anything, " ", "'", Silent },
{Anything, " ", Anything, Pause },
{Anything, "-", Anything, Silent },
{".", "'S", Anything, "z" },
{"#:.E", "'S", Anything, "z" },
{"#", "'S", Anything, "z" },
{Anything, "'", Anything, Silent },
{Anything, ",", Anything, Pause },
{Anything, ".", Anything, Pause },
{Anything, "?", Anything, Pause },
{Anything, "!", Anything, Pause },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'A'.
** Tried from top to bottom, first fit wins; the last entry has no
** context requirement and so catches every remaining 'A'.
*/
static Rule A_rules[] =
{
{Anything, "A", Nothing, "AX" },
{Nothing, "ARE", Nothing, "AAr" },
{Nothing, "AR", "O", "AXr" },
{Anything, "AR", "#", "EHr" },
{"^", "AS", "#", "EYs" },
{Anything, "A", "WA", "AX" },
{Anything, "AW", Anything, "AO" },
{" :", "ANY", Anything, "EHnIY" },
{Anything, "A", "^+#", "EY" },
{"#:", "ALLY", Anything, "AXlIY" },
{Nothing, "AL", "#", "AXl" },
{Anything, "AGAIN", Anything, "AXgEHn"},
{"#:", "AG", "E", "IHj" },
{Anything, "A", "^+:#", "AE" },
{" :", "A", "^+ ", "EY" },
{Anything, "A", "^%", "EY" },
{Nothing, "ARR", Anything, "AXr" },
{Anything, "ARR", Anything, "AEr" },
{" :", "AR", Nothing, "AAr" },
{Anything, "AR", Nothing, "ER" },
{Anything, "AR", Anything, "AAr" },
{Anything, "AIR", Anything, "EHr" },
{Anything, "AI", Anything, "EY" },
{Anything, "AY", Anything, "EY" },
{Anything, "AU", Anything, "AO" },
{"#:", "AL", Nothing, "AXl" },
{"#:", "ALS", Nothing, "AXlz" },
{Anything, "ALK", Anything, "AOk" },
{Anything, "AL", "^", "AOl" },
{" :", "ABLE", Anything, "EYbAXl"},
{Anything, "ABLE", Anything, "AXbAXl"},
{Anything, "ANG", "+", "EYnj" },
{Anything, "A", Anything, "AE" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'B'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule B_rules[] =
{
{Nothing, "BE", "^#", "bIH" },
{Anything, "BEING", Anything, "bIYIHNG"},
{Nothing, "BOTH", Nothing, "bOWTH" },
{Nothing, "BUS", "#", "bIHz" },
{Anything, "BUIL", Anything, "bIHl" },
{Anything, "B", Anything, "b" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'C'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule C_rules[] =
{
{Nothing, "CH", "^", "k" },
{"^E", "CH", Anything, "k" },
{Anything, "CH", Anything, "CH" },
{" S", "CI", "#", "sAY" },
{Anything, "CI", "A", "SH" },
{Anything, "CI", "O", "SH" },
{Anything, "CI", "EN", "SH" },
{Anything, "C", "+", "s" },
{Anything, "CK", Anything, "k" },
{Anything, "COM", "%", "kAHm" },
{Anything, "C", Anything, "k" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'D'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule D_rules[] =
{
{"#:", "DED", Nothing, "dIHd" },
{".E", "D", Nothing, "d" },
{"#^:E", "D", Nothing, "t" },
{Nothing, "DE", "^#", "dIH" },
{Nothing, "DO", Nothing, "dUW" },
{Nothing, "DOES", Anything, "dAHz" },
{Nothing, "DOING", Anything, "dUWIHNG"},
{Nothing, "DOW", Anything, "dAW" },
{Anything, "DU", "A", "jUW" },
{Anything, "D", Anything, "d" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'E'.
** Tried from top to bottom, first fit wins; the last entry has no
** context requirement and so catches every remaining 'E'.
**
** Output strings are concatenations of the phoneme codes listed at the
** top of this file: one lower-case letter for most consonants, two
** upper-case letters for vowels and the remaining consonants.
*/
static Rule E_rules[] =
{
{"#:", "E", Nothing, Silent },
{"'^:", "E", Nothing, Silent },
{" :", "E", Nothing, "IY" },
{"#", "ED", Nothing, "d" },
{"#:", "E", "D ", Silent },
{Anything, "EV", "ER", "EHv" },
{Anything, "E", "^%", "IY" },
{Anything, "ERI", "#", "IYrIY" },
{Anything, "ERI", Anything, "EHrIH" },
{"#:", "ER", "#", "ER" },
{Anything, "ER", "#", "EHr" },
{Anything, "ER", Anything, "ER" },
{Nothing, "EVEN", Anything, "IYvEHn"},
{"#:", "E", "W", Silent },
{"T", "EW", Anything, "UW" },
{"S", "EW", Anything, "UW" },
{"R", "EW", Anything, "UW" },
{"D", "EW", Anything, "UW" },
{"L", "EW", Anything, "UW" },
{"Z", "EW", Anything, "UW" },
{"N", "EW", Anything, "UW" },
{"J", "EW", Anything, "UW" },
{"TH", "EW", Anything, "UW" },
{"CH", "EW", Anything, "UW" },
{"SH", "EW", Anything, "UW" },
{Anything, "EW", Anything, "yUW" }, /* y (Young) + UW (fOOl) */
{Anything, "E", "O", "IY" },
{"#:S", "ES", Nothing, "IHz" },
{"#:C", "ES", Nothing, "IHz" },
{"#:G", "ES", Nothing, "IHz" },
{"#:Z", "ES", Nothing, "IHz" },
{"#:X", "ES", Nothing, "IHz" },
{"#:J", "ES", Nothing, "IHz" },
{"#:CH", "ES", Nothing, "IHz" },
{"#:SH", "ES", Nothing, "IHz" },
{"#:", "E", "S ", Silent },
{"#:", "ELY", Nothing, "lIY" },
{"#:", "EMENT", Anything, "mEHnt" },
{Anything, "EFUL", Anything, "fUHl" },
{Anything, "EE", Anything, "IY" },
{Anything, "EARN", Anything, "ERn" },
{Nothing, "EAR", "^", "ER" },
{Anything, "EAD", Anything, "EHd" },
{"#:", "EA", Nothing, "IYAX" },
{Anything, "EA", "SU", "EH" },
{Anything, "EA", Anything, "IY" },
{Anything, "EIGH", Anything, "EY" },
{Anything, "EI", Anything, "IY" },
{Nothing, "EYE", Anything, "AY" },
{Anything, "EY", Anything, "IY" },
{Anything, "EU", Anything, "yUW" }, /* y (Young) + UW (fOOl) */
{Anything, "E", Anything, "EH" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'F'.
** First fit wins; the last entry is the unconditional fallback.
*/
static Rule F_rules[] =
{
{Anything, "FUL", Anything, "fUHl" },
{Anything, "F", Anything, "f" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'G'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule G_rules[] =
{
{Anything, "GIV", Anything, "gIHv" },
{Nothing, "G", "I^", "g" },
{Anything, "GE", "T", "gEH" },
{"SU", "GGES", Anything, "gjEHs" },
{Anything, "GG", Anything, "g" },
{" B#", "G", Anything, "g" },
{Anything, "G", "+", "j" },
{Anything, "GREAT", Anything, "grEYt" },
{"#", "GH", Anything, Silent },
{Anything, "G", Anything, "g" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'H'.
** Tried from top to bottom, first fit wins; the last entry makes any
** 'H' that is not followed by a vowel silent.
*/
static Rule H_rules[] =
{
{Nothing, "HAV", Anything, "hAEv" },
{Nothing, "HERE", Anything, "hIYr" },
{Nothing, "HOUR", Anything, "AWER" },
{Anything, "HOW", Anything, "hAW" },
{Anything, "H", "#", "h" },
{Anything, "H", Anything, Silent },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'I'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule I_rules[] =
{
{Nothing, "IN", Anything, "IHn" },
{Nothing, "I", Nothing, "AY" },
{Anything, "IN", "D", "AYn" },
{Anything, "IER", Anything, "IYER" },
{"#:R", "IED", Anything, "IYd" },
{Anything, "IED", Nothing, "AYd" },
{Anything, "IEN", Anything, "IYEHn" },
{Anything, "IE", "T", "AYEH" },
{" :", "I", "%", "AY" },
{Anything, "I", "%", "IY" },
{Anything, "IE", Anything, "IY" },
{Anything, "I", "^+:#", "IH" },
{Anything, "IR", "#", "AYr" },
{Anything, "IZ", "%", "AYz" },
{Anything, "IS", "%", "AYz" },
{Anything, "I", "D%", "AY" },
{"+^", "I", "^+", "IH" },
{Anything, "I", "T%", "AY" },
{"#^:", "I", "^+", "IH" },
{Anything, "I", "^+", "AY" },
{Anything, "IR", Anything, "ER" },
{Anything, "IGH", Anything, "AY" },
{Anything, "ILD", Anything, "AYld" },
{Anything, "IGN", Nothing, "AYn" },
{Anything, "IGN", "^", "AYn" },
{Anything, "IGN", "%", "AYn" },
{Anything, "IQUE", Anything, "IYk" },
{Anything, "I", Anything, "IH" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/* Rules whose match text starts with 'J': a single unconditional entry. */
static Rule J_rules[] =
{
{Anything, "J", Anything, "j" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'K'.
** A word-initial 'K' before 'N' is silent; everything else is "k".
*/
static Rule K_rules[] =
{
{Nothing, "K", "N", Silent },
{Anything, "K", Anything, "k" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'L'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule L_rules[] =
{
{Anything, "LO", "C#", "lOW" },
{"L", "L", Anything, Silent },
{"#^:", "L", "%", "AXl" },
{Anything, "LEAD", Anything, "lIYd" },
{Anything, "L", Anything, "l" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'M'.
** First fit wins; the last entry is the unconditional fallback.
*/
static Rule M_rules[] =
{
{Anything, "MOV", Anything, "mUWv" },
{Anything, "M", Anything, "m" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'N'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule N_rules[] =
{
{"E", "NG", "+", "nj" },
{Anything, "NG", "R", "NGg" },
{Anything, "NG", "#", "NGg" },
{Anything, "NGL", "%", "NGgAXl"},
{Anything, "NG", Anything, "NG" },
{Anything, "NK", Anything, "NGk" },
{Nothing, "NOW", Nothing, "nAW" },
{Anything, "N", Anything, "n" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'O'.
** Tried from top to bottom, first fit wins; the last entry has no
** context requirement and so catches every remaining 'O'.
*/
static Rule O_rules[] =
{
{Anything, "OF", Nothing, "AXv" },
{Anything, "OROUGH", Anything, "EROW" },
{"#:", "OR", Nothing, "ER" },
{"#:", "ORS", Nothing, "ERz" },
{Anything, "OR", Anything, "AOr" },
{Nothing, "ONE", Anything, "wAHn" },
{Anything, "OW", Anything, "OW" },
{Nothing, "OVER", Anything, "OWvER" },
{Anything, "OV", Anything, "AHv" },
{Anything, "O", "^%", "OW" },
{Anything, "O", "^EN", "OW" },
{Anything, "O", "^I#", "OW" },
{Anything, "OL", "D", "OWl" },
{Anything, "OUGHT", Anything, "AOt" },
{Anything, "OUGH", Anything, "AHf" },
{Nothing, "OU", Anything, "AW" },
{"H", "OU", "S#", "AW" },
{Anything, "OUS", Anything, "AXs" },
{Anything, "OUR", Anything, "AOr" },
{Anything, "OULD", Anything, "UHd" },
{"^", "OU", "^L", "AH" },
{Anything, "OUP", Anything, "UWp" },
{Anything, "OU", Anything, "AW" },
{Anything, "OY", Anything, "OY" },
{Anything, "OING", Anything, "OWIHNG"},
{Anything, "OI", Anything, "OY" },
{Anything, "OOR", Anything, "AOr" },
{Anything, "OOK", Anything, "UHk" },
{Anything, "OOD", Anything, "UHd" },
{Anything, "OO", Anything, "UW" },
{Anything, "O", "E", "OW" },
{Anything, "O", Nothing, "OW" },
{Anything, "OA", Anything, "OW" },
{Nothing, "ONLY", Anything, "OWnlIY"},
{Nothing, "ONCE", Anything, "wAHns" },
{Anything, "ON'T", Anything, "OWnt" },
{"C", "O", "N", "AA" },
{Anything, "O", "NG", "AO" },
{"^:", "O", "N", "AH" },
{"I", "ON", Anything, "AXn" },
{"#:", "ON", Nothing, "AXn" },
{"#^", "ON", Anything, "AXn" },
{Anything, "O", "ST ", "OW" },
{Anything, "OF", "^", "AOf" },
{Anything, "OTHER", Anything, "AHDHER"},
{Anything, "OSS", Nothing, "AOs" },
{"#^:", "OM", Anything, "AHm" },
{Anything, "O", Anything, "AA" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'P'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule P_rules[] =
{
{Anything, "PH", Anything, "f" },
{Anything, "PEOP", Anything, "pIYp" },
{Anything, "POW", Anything, "pAW" },
{Anything, "PUT", Nothing, "pUHt" },
{Anything, "P", Anything, "p" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'Q'.
** Longer match texts come first so that "QUAR" and "QU" are tried
** before the bare 'Q' fallback.
*/
static Rule Q_rules[] =
{
{Anything, "QUAR", Anything, "kwAOr" },
{Anything, "QU", Anything, "kw" },
{Anything, "Q", Anything, "k" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'R'.
** First fit wins; the last entry is the unconditional fallback.
*/
static Rule R_rules[] =
{
{Nothing, "RE", "^#", "rIY" },
{Anything, "R", Anything, "r" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'S'.
** Tried from top to bottom, first fit wins; the last entry has no
** context requirement and so catches every remaining 'S'.
*/
static Rule S_rules[] =
{
{Anything, "SH", Anything, "SH" },
{"#", "SION", Anything, "ZHAXn" },
{Anything, "SOME", Anything, "sAHm" },
{"#", "SUR", "#", "ZHER" },
{Anything, "SUR", "#", "SHER" },
{"#", "SU", "#", "ZHUW" },
{"#", "SSU", "#", "SHUW" },
{"#", "SED", Nothing, "zd" },
{"#", "S", "#", "z" },
{Anything, "SAID", Anything, "sEHd" },
{"^", "SION", Anything, "SHAXn" },
{Anything, "S", "S", Silent },
{".", "S", Nothing, "z" },
{"#:.E", "S", Nothing, "z" },
{"#^:##", "S", Nothing, "z" },
{"#^:#", "S", Nothing, "s" },
{"U", "S", Nothing, "s" },
{" :#", "S", Nothing, "z" },
{Nothing, "SCH", Anything, "sk" },
{Anything, "S", "C+", Silent },
{"#", "SM", Anything, "zm" },
{"#", "SN", "'", "zAXn" },
{Anything, "S", Anything, "s" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'T'.
** Tried from top to bottom, first fit wins: the specific "TH..." words
** come before the general "TH" entry, and the last entry is the
** unconditional fallback.
*/
static Rule T_rules[] =
{
{Nothing, "THE", Nothing, "DHAX" },
{Anything, "TO", Nothing, "tUW" },
{Anything, "THAT", Nothing, "DHAEt" },
{Nothing, "THIS", Nothing, "DHIHs" },
{Nothing, "THEY", Anything, "DHEY" },
{Nothing, "THERE", Anything, "DHEHr" },
{Anything, "THER", Anything, "DHER" },
{Anything, "THEIR", Anything, "DHEHr" },
{Nothing, "THAN", Nothing, "DHAEn" },
{Nothing, "THEM", Nothing, "DHEHm" },
{Anything, "THESE", Nothing, "DHIYz" },
{Nothing, "THEN", Anything, "DHEHn" },
{Anything, "THROUGH", Anything, "THrUW" },
{Anything, "THOSE", Anything, "DHOWz" },
{Anything, "THOUGH", Nothing, "DHOW" },
{Nothing, "THUS", Anything, "DHAHs" },
{Anything, "TH", Anything, "TH" },
{"#:", "TED", Nothing, "tIHd" },
{"S", "TI", "#N", "CH" },
{Anything, "TI", "O", "SH" },
{Anything, "TI", "A", "SH" },
{Anything, "TIEN", Anything, "SHAXn" },
{Anything, "TUR", "#", "CHER" },
{Anything, "TU", "A", "CHUW" },
{Nothing, "TWO", Anything, "tUW" },
{Anything, "T", Anything, "t" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'U'.
** Tried from top to bottom, first fit wins; the last entry has no
** context requirement and so catches every remaining 'U'.
**
** Context symbols (#, :, ^, ., %, +) are only interpreted in the left
** and right context fields; the match text is compared with the word
** character by character, so it must contain literal text only.
*/
static Rule U_rules[] =
{
{Nothing, "UN", "I", "yUWn" },
{Nothing, "UN", Anything, "AHn" },
{Nothing, "UPON", Anything, "AXpAOn"},
{"T", "UR", "#", "UHr" },
{"S", "UR", "#", "UHr" },
{"R", "UR", "#", "UHr" },
{"D", "UR", "#", "UHr" },
{"L", "UR", "#", "UHr" },
{"Z", "UR", "#", "UHr" },
{"N", "UR", "#", "UHr" },
{"J", "UR", "#", "UHr" },
{"TH", "UR", "#", "UHr" },
{"CH", "UR", "#", "UHr" },
{"SH", "UR", "#", "UHr" },
{Anything, "UR", "#", "yUHr" },
{Anything, "UR", Anything, "ER" },
{Anything, "U", "^ ", "AH" },
{Anything, "U", "^^", "AH" }, /* 'U' followed by two consonants */
{Anything, "UY", Anything, "AY" },
{" G", "U", "#", Silent },
{"G", "U", "%", Silent },
{"G", "U", "#", "w" },
{"#N", "U", Anything, "yUW" }, /* y (Young) + UW (fOOl) */
{"T", "U", Anything, "UW" },
{"S", "U", Anything, "UW" },
{"R", "U", Anything, "UW" },
{"D", "U", Anything, "UW" },
{"L", "U", Anything, "UW" },
{"Z", "U", Anything, "UW" },
{"N", "U", Anything, "UW" },
{"J", "U", Anything, "UW" },
{"TH", "U", Anything, "UW" },
{"CH", "U", Anything, "UW" },
{"SH", "U", Anything, "UW" },
{Anything, "U", Anything, "yUW" }, /* y (Young) + UW (fOOl) */
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'V'.
** First fit wins; the last entry is the unconditional fallback.
*/
static Rule V_rules[] =
{
{Anything, "VIEW", Anything, "vyUW" }, /* v + y (Young) + UW (fOOl) */
{Anything, "V", Anything, "v" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'W'.
** Tried from top to bottom, first fit wins: the specific "WH..." words
** come before the general "WH" entry, and the last entry is the
** unconditional fallback.
*/
static Rule W_rules[] =
{
{Nothing, "WERE", Anything, "wER" },
{Anything, "WA", "S", "wAA" },
{Anything, "WA", "T", "wAA" },
{Anything, "WHERE", Anything, "WHEHr" }, /* WH + EH + r */
{Anything, "WHAT", Anything, "WHAAt" },
{Anything, "WHOL", Anything, "hOWl" },
{Anything, "WHO", Anything, "hUW" },
{Anything, "WH", Anything, "WH" },
{Anything, "WAR", Anything, "wAOr" },
{Anything, "WOR", "^", "wER" },
{Anything, "WR", Anything, "r" },
{Anything, "W", Anything, "w" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/* Rules whose match text starts with 'X': a single unconditional entry. */
static Rule X_rules[] =
{
{Anything, "X", Anything, "ks" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/*
** Rules whose match text starts with 'Y'.
** Tried from top to bottom, first fit wins; the last entry is the
** unconditional fallback.
*/
static Rule Y_rules[] =
{
{Anything, "YOUNG", Anything, "yAHNG" },
{Nothing, "YOU", Anything, "yUW" },
{Nothing, "YES", Anything, "yEHs" },
{Nothing, "Y", Anything, "y" },
{"#^:", "Y", Nothing, "IY" },
{"#^:", "Y", "I", "IY" },
{" :", "Y", Nothing, "AY" },
{" :", "Y", "#", "AY" },
{" :", "Y", "^+:#", "IH" },
{" :", "Y", "^#", "AY" },
{Anything, "Y", Anything, "IH" },
};
/*
** LEFT_PART MATCH_PART RIGHT_PART OUT_PART
*/
/* Rules whose match text starts with 'Z': a single unconditional entry. */
static Rule Z_rules[] =
{
{Anything, "Z", Anything, "z" },
};
/*
** Master table of rule sets, the only symbol this file exports.
** Entry 0 is the punctuation set; entries 1 through 26 are the sets for
** 'A' through 'Z', matching the index xlate_word computes as
** (letter - 'A' + 1).
*/
Rule *Rules[] =
{
punct_rules,
A_rules, B_rules, C_rules, D_rules, E_rules, F_rules, G_rules,
H_rules, I_rules, J_rules, K_rules, L_rules, M_rules, N_rules,
O_rules, P_rules, Q_rules, R_rules, S_rules, T_rules, U_rules,
V_rules, W_rules, X_rules, Y_rules, Z_rules
};
-------------------------------------------------------------------------------
PARSE.C
-------------------------------------------------------------------------------
#include <stdio.h>
#include <ctype.h>
/*
** Character-class flags. xlate_file ORs one of these into a word's
** `type` for every character it stores, and mash_word switches on the
** combined value. LOWER is also set for an apostrophe.
*/
#define UPPER 1
#define LOWER 2
#define NUMBER 4
#define SPECIAL 8
/* Size in chars of the word buffer in xlate_file */
#define MAX_WORD_LENGTH 128
/* Input and output streams; set up by main, also referenced from PHONEME.C */
FILE *In_file;
FILE *Out_file;
/*
** Program entry point.
**
** Usage: PARSE [infile [outfile]]
**
** Opens the optional input and output files (defaulting to stdin and
** stdout), stores them in In_file and Out_file, and translates the
** whole input with xlate_file().
**
** Returns 0 on success, 1 on a usage error or when a file cannot be
** opened or the output file cannot be completely written.
*/
int main(argc, argv)
int argc;
char *argv[];
{
    int status = 0;

    if (argc > 3)
    {
        fputs("Usage: PHONEME [infile [outfile]]\n", stderr);
        return 1;
    }

    if (argc > 1)
    {
        In_file = fopen(argv[1], "r");
        if (In_file == NULL)
        {
            fputs("Error: Cannot open input file.\n", stderr);
            return 1;
        }
    }
    else
        In_file = stdin;

    if (argc > 2)
    {
        Out_file = fopen(argv[2], "w");
        if (Out_file == NULL)
        {
            fputs("Error: Cannot create output file.\n", stderr);
            if (In_file != stdin)
                fclose(In_file);
            return 1;
        }
    }
    else
        Out_file = stdout;

    xlate_file();

    if (In_file != stdin)
        fclose(In_file);

    /* fclose flushes; a failure here means phonemes were lost */
    if (Out_file != stdout && fclose(Out_file) != 0)
    {
        fputs("Error: Cannot write output file.\n", stderr);
        status = 1;
    }
    return status;
}
/*
** Read In_file word by word and hand each word to mash_word().
**
** A word ends at white space, '.', '?', '!' or end of file; that
** character is passed on as the terminator and is not part of the word.
** Each word is upper-cased and stored as " WORD " (one blank on each
** side) before translation. A word too long for the buffer is split:
** the part that fits is translated and the rest is read as the next
** word, with a blank reported as the terminator of the first part.
**
** Returns 0 once end of file has been reached.
*/
int xlate_file()
{
    char buffer[MAX_WORD_LENGTH]; /* Storage for word */
    int count;                    /* number of characters in word */
    int type;                     /* types of characters in word */
    int chr;
    int terminator;               /* Character after word */

    for (;;) /* All of the words in the file */
    {
        type = 0;         /* Flags for types of characters */
        count = 0;
        terminator = ' '; /* Used when the word is cut by a full buffer */
        buffer[count++] = ' '; /* Initial blank */

        for (;;) /* All of the characters in the word */
        {
            chr = getc(In_file);

            /* Check for end of word */
            if (isspace(chr) || chr == EOF || chr == '.' ||
                chr == '?' || chr == '!')
            {
                terminator = chr;
                break;
            }

            /*
            ** Check for buffer full: besides this character there must
            ** still be room for the terminating blank and null. Push the
            ** character back so it starts the next word instead of
            ** being lost.
            */
            if (count > MAX_WORD_LENGTH - 3)
            {
                ungetc(chr, In_file);
                break;
            }

            buffer[count++] = toupper(chr);

            if (isupper(chr))
                type |= UPPER;
            else if (islower(chr) || chr == '\'')
                type |= LOWER;
            else if (isdigit(chr))
                type |= NUMBER;
            else
                type |= SPECIAL;
        }

        buffer[count++] = ' ';  /* Terminating blank */
        buffer[count++] = '\0'; /* Terminating null */

        /* Figure out what it is */
        mash_word(buffer, type, terminator);

        if (chr == EOF)
            return 0;
    }
}
/*
** Translate one word that xlate_file has already wrapped in blanks.
**
** word        the text, in the form " WORD "
** type        OR of the character-class flags seen in the word
** terminator  the character that ended the word in the input
**
** A word made of digits only (type is exactly NUMBER) is spoken digit
** by digit, followed by "POINT" when it was ended by a '.'. Every
** other word is passed to xlate_word() unchanged.
*/
mash_word(word, type, terminator)
char *word;
int type;
int terminator;
{
    /* Spoken form of each decimal digit, indexed by its value */
    static char *digit_names[] =
    {
        " ZERO ", " ONE ", " TWO ", " THREE ", " FOUR ",
        " FIVE ", " SIX ", " SEVEN ", " EIGHT ", " NINE "
    };
    char *cursor;

    if (type != NUMBER)
    {
        xlate_word(word);
    }
    else
    {
        /* Skip the leading blank and stop at the trailing one */
        for (cursor = word + 1; *cursor != ' '; cursor++)
        {
            if (*cursor >= '0' && *cursor <= '9')
                xlate_word(digit_names[*cursor - '0']);
        }

        if (terminator == '.')
            xlate_word(" POINT ");
    }
}
-------------------------------------------------------------------------------
PHONEME.C
-------------------------------------------------------------------------------
#include <ctype.h>
#include <stdio.h>
#include <string.h>
/*
** English to Phoneme translation.
**
** Rules are made up of four parts:
**
** The left context.
** The text to match.
** The right context.
** The phonemes to substitute for the matched text.
**
** Procedure:
**
** Separate each block of letters (apostrophes included)
** and add a space on each side. For each unmatched
** letter in the word, look through the rules where the
** text to match starts with the letter in the word. If
** the text to match is found and the right and left
** context patterns also match, output the phonemes for
** that rule and skip to the next unmatched letter.
**
**
** Special Context Symbols:
**
** # One or more vowels
** : Zero or more consonants
** ^ One consonant.
** . One of B, D, V, G, J, L, M, N, R, W or Z (voiced
** consonants)
** % One of ER, E, ES, ED, ING, ELY (a suffix)
** (Right context only)
** + One of E, I or Y (a "front" vowel)
*/
/* Field order within a rule: left context, match text, right context, output */
typedef char *Rule[4]; /* A rule is four character pointers */
/* Indexed by xlate_word: 0 for blank/apostrophe, 1..26 for 'A'..'Z' */
extern Rule *Rules[]; /* An array of pointers to rules */
/* Declared here for both streams; find_rule writes phonemes to Out_file */
extern FILE *In_file, *Out_file;
/*
** Return 1 when chr is one of the upper-case vowels A, E, I, O or U,
** and 0 for every other character (including 'Y' and lower case).
*/
int isvowel(chr)
char chr;
{
    switch (chr)
    {
    case 'A':
    case 'E':
    case 'I':
    case 'O':
    case 'U':
        return 1;
    default:
        return 0;
    }
}
/*
** Return non-zero when chr is an upper-case letter that isvowel()
** rejects; 'Y' therefore counts as a consonant. Blanks, apostrophes
** and lower-case letters yield 0.
*/
int isconsonant(chr)
char chr;
{
    /* The <ctype.h> functions are only defined for unsigned char values
    ** and EOF, so a plain char must be converted before the call. */
    return (isupper((unsigned char) chr) && !isvowel(chr));
}
/*
** Translate one word to phonemes.
**
** word  upper-case text in the form " WORD ": the character at index 0
**       is skipped and only serves as left context for the first letter.
**
** Starting at index 1, the rule set for the current character is looked
** up in Rules[] (1..26 for 'A'..'Z', 0 for blank, apostrophe and the
** punctuation marks that have entries in the punctuation set) and
** find_rule() writes the phonemes and returns the index of the next
** untranslated character. A character that has no rule set is reported
** on stderr and skipped.
**
** Always returns 0.
*/
int xlate_word(word)
char word[];
{
    int index; /* Current position in word */
    int type;  /* Index into Rules[] for the current character */

    if (word[0] == '\0') /* Nothing after the (missing) initial blank */
        return 0;

    index = 1; /* Skip the initial blank */
    while (word[index] != '\0')
    {
        if (isupper((unsigned char) word[index]))
            type = word[index] - 'A' + 1;
        else
        {
            switch (word[index])
            {
            case ' ':
            case '\'':
            case '-':
            case ',':
            case '.':
            case '?':
            case '!':
                type = 0;
                break;
            default:
                /*
                ** No rule set can match this character; searching one
                ** anyway would run off the end of a table, so skip it.
                */
                fprintf(stderr,"Bad character in word: '%c'\n", word[index]);
                index++;
                continue;
            }
        }

        index = find_rule(word, index, Rules[type]);
    }
    return 0;
}
/*
** Apply the first rule that fits at word[index].
**
** word   the blank-wrapped word being translated
** index  position of the first untranslated character
** rules  rule set to search, in order
**
** A rule fits when its match text equals the word text starting at
** index, its left context matches the text ending at index-1 and its
** right context matches the text following the match. The rule's
** phonemes are written to Out_file and the index just past the matched
** text is returned.
**
** The search has no end test of its own: the rule set must contain an
** entry that fits, otherwise the scan continues past the table.
*/
find_rule(word, index, rules)
char word[];
int index;
Rule *rules;
{
    char **fields; /* The four strings of the rule under test */
    char *pat;     /* Walks the rule's match text */
    int next;      /* Walks the word alongside pat */

    for (;; rules++)
    {
        fields = *rules;

        /* Compare the match text with the word, character by character */
        pat = fields[1];
        next = index;
        while (*pat != '\0' && *pat == word[next])
        {
            pat++;
            next++;
        }

        /* All of the match text consumed and both contexts agree? */
        if (*pat == '\0'
            && leftmatch(fields[0], word + index - 1)
            && rightmatch(fields[2], word + next))
        {
            fputs(fields[3], Out_file);
            return next;
        }
    }
}
/*
** Test a rule's left context against the text preceding a match.
**
** pattern  left-context pattern of the rule (first char of the string)
** context  points at the word character immediately before the match
**
** The pattern is processed from its last character to its first while
** the text is walked backwards from context. Letters, apostrophes and
** blanks must match literally; the special symbols are
**   #  one or more vowels          :  zero or more consonants
**   ^  one consonant               .  one of B D V G J L M N R W Z
**   +  one of E, I, Y
** '%' is not allowed in a left context and is reported on stderr.
**
** Returns 1 when the whole pattern matches (an empty pattern always
** does) and 0 otherwise.
*/
int leftmatch(pattern, context)
char *pattern; /* first char of pattern to match in text */
char *context; /* last char of text to be matched */
{
    char *pat;
    char *text;
    int pos;

    if (*pattern == '\0') /* null string matches any context */
        return 1;

    text = context;

    /* Visit every pattern character, last to first, including pattern[0] */
    for (pos = (int) strlen(pattern) - 1; pos >= 0; pos--)
    {
        pat = pattern + pos;

        /* First check for simple text or space */
        if (isalpha((unsigned char) *pat) || *pat == '\'' || *pat == ' ')
        {
            if (*pat != *text)
                return 0;
            text--;
            continue;
        }

        switch (*pat)
        {
        case '#': /* One or more vowels */
            if (!isvowel(*text))
                return 0;
            text--;
            while (isvowel(*text))
                text--;
            break;

        case ':': /* Zero or more consonants */
            while (isconsonant(*text))
                text--;
            break;

        case '^': /* One consonant */
            if (!isconsonant(*text))
                return 0;
            text--;
            break;

        case '.': /* B, D, V, G, J, L, M, N, R, W, Z */
            if (*text != 'B' && *text != 'D' && *text != 'V'
                && *text != 'G' && *text != 'J' && *text != 'L'
                && *text != 'M' && *text != 'N' && *text != 'R'
                && *text != 'W' && *text != 'Z')
                return 0;
            text--;
            break;

        case '+': /* E, I or Y (front vowel) */
            if (*text != 'E' && *text != 'I' && *text != 'Y')
                return 0;
            text--;
            break;

        case '%': /* Suffix symbol is valid in right contexts only */
        default:
            fprintf(stderr, "Bad char in left rule: '%c'\n", *pat);
            return 0;
        }
    }

    return 1;
}
/*
** Test a rule's right context against the text following a match.
**
** pattern  right-context pattern of the rule
** context  points at the word character immediately after the match
**
** The pattern is processed left to right while the text is walked
** forwards from context. Letters, apostrophes and blanks must match
** literally; the special symbols are
**   #  one or more vowels          :  zero or more consonants
**   ^  one consonant               .  one of B D V G J L M N R W Z
**   +  one of E, I, Y              %  a suffix: E, ER, ES, ED, ELY, ING
**
** Returns 1 when the whole pattern matches (an empty pattern always
** does) and 0 otherwise.
*/
int rightmatch(pattern, context)
char *pattern; /* first char of pattern to match in text */
char *context; /* first char of text following the match */
{
    char *pat;
    char *text;

    if (*pattern == '\0') /* null string matches any context */
        return 1;

    text = context;

    for (pat = pattern; *pat != '\0'; pat++)
    {
        /* First check for simple text or space */
        if (isalpha((unsigned char) *pat) || *pat == '\'' || *pat == ' ')
        {
            if (*pat != *text)
                return 0;
            text++;
            continue;
        }

        switch (*pat)
        {
        case '#': /* One or more vowels */
            if (!isvowel(*text))
                return 0;
            text++;
            while (isvowel(*text))
                text++;
            break;

        case ':': /* Zero or more consonants */
            while (isconsonant(*text))
                text++;
            break;

        case '^': /* One consonant */
            if (!isconsonant(*text))
                return 0;
            text++;
            break;

        case '.': /* B, D, V, G, J, L, M, N, R, W, Z */
            if (*text != 'B' && *text != 'D' && *text != 'V'
                && *text != 'G' && *text != 'J' && *text != 'L'
                && *text != 'M' && *text != 'N' && *text != 'R'
                && *text != 'W' && *text != 'Z')
                return 0;
            text++;
            break;

        case '+': /* E, I or Y (front vowel) */
            if (*text != 'E' && *text != 'I' && *text != 'Y')
                return 0;
            text++;
            break;

        case '%': /* ER, E, ES, ED, ING, ELY (a suffix) */
            if (*text == 'E')
            {
                text++;
                if (*text == 'L')
                {
                    /* "ELY" is consumed whole; otherwise only the E
                    ** is taken and the L is left for what follows */
                    if (text[1] == 'Y')
                        text += 2;
                }
                else if (*text == 'R' || *text == 'S' || *text == 'D')
                    text++;
                break;
            }
            if (text[0] == 'I' && text[1] == 'N' && text[2] == 'G')
            {
                text += 3;
                break;
            }
            return 0;

        default:
            fprintf(stderr, "Bad char in right rule:'%c'\n", *pat);
            return 0;
        }
    }

    return 1;
}
-------------------------------------------------------------------------------
End of Source Files
-------------------------------------------------------------------------------
More information about the Comp.sources.unix
mailing list