%{
/* num_to_name.y: Yacc grammer for number-to-words filter. */
          /* ( Programming exercise, 1/94 ) */
/*
 
	usage: num_to_name [-l] [< input] [> output]

	options: l:  use Latin as the language for numerals

Default:  (English)
*/
/* Parsing assumes the number has commas, e.g., 1,102,001 */
/* The lexer (num_to_name.l) ensures this is the case */
/* This is a filter: it passes non-numbers on through, and
	changes numbers to words, e.g
 I saw 121 girls -> I saw one hundred twenty one girls

With the -l option:

 I saw 1344 girls -> I saw mille et trecenti quadraginta quattuor girls   

Bugs:	Won't handle decimals.
	Won't handle numbers at the end of a sentence or with
		other punctuation right after them as numbers.
	Won't handle ordinals, e.g doesn't do 2nd -> second etc
	"Only" goes up to decillions. No undecillions or more :-)
	Latin version: only handles billions and smaller numbers
	Does not supply macrons 
	Declinable noun and adjective numbers are not always properly
        declined i.e., their form is not adapted to context. 

Consult the README for further information relevant to the programming.

*/

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
int yylex(void);  /* scanner generated by lex */
int yyerror(char *); /* parser generated by yacc */

#define VERSION "1.1"
#define ENGLISH 1
#define LATIN 2
#define TRUE 0
#define FALSE 1
#define BUFSIZE 10000   /* Surely overkill */

int indx = 0;

char *(*name)(int);  /* Pointer to namer routine */
char *ename( int triad );  /* Names numbers in range 00-999 in english */
char *lname( int triad );  /* Names numbers in range 00-999 in latin */
char *laname( int triad );  /* Names number adverbs in range 00-999 in latin */
void clearbuf( char *first, int size ); /* does what it says */
void squeeze(char *); /* remove extraneous characters */ 
char *prepend(const char *, char *); /* supplement to string.h */

int start_state = ENGLISH; /* default  */

static char buf[BUFSIZE];          /* number buffer */
static char localbuf[BUFSIZE]; /* used by prepend */
static char temp[BUFSIZ];

/* This is a kludge needed for the switch to muliplicative notation in Latin */
static int retained_centena;
static int retained_millions;
char rollback[BUFSIZE];

/* Used to reset before the next parse of a number */
void reset(void){
	clearbuf(buf,strlen(buf));
	clearbuf(localbuf,strlen(localbuf));
	clearbuf(temp,strlen(temp));
	clearbuf(rollback,strlen(rollback));
	retained_centena = 0;
	retained_millions = 0;
}

/* Building-block English number names used by ename routine */

static char *digit_names[] = { "", " one", " two", " three",
	" four", " five", " six", " seven", " eight", " nine"};

static char *teen_names[] = { " ten", " eleven", " twelve", 
	" thirteen", " fourteen", " fifteen", " sixteen", 
	" seventeen", " eighteen", " nineteen"};

static char *tens_names[] = { " twenty", " thirty", " forty",
	" fifty", " sixty", " seventy", " eighty", " ninety"};

/* Building-block Latin number names */

/* All but the first 3 of these are indeclinable: */

static char *ldigit_names[] = { "", " unus", " duo", " tres",
	" quattuor", " quinque", " sex", " septem", " octo", " novem"};

/* All of these are indeclinable: */

static char *lteen_names[] = { " decem", " undecim", " duodecim", 
	" tredecim", " quattuordecim", " quindecim", " sedecim", 
	" septendecim", " duodeviginti", " undeviginti"};

/* All of these are indeclinable: */

static char *ltens_names[] = { " viginti", " triginta", " quadraginta",
	" quinquaginta", " sexaginta", " septuaginta", " octoginta", " nonaginta"};

/* Centum is indeclibable, the rest are plural 2nd decl. adjectives: */

static char *lhundreds_names[] = {" centum", " ducenti", " trecenti",
" quadringenti", " quingenti", " sescenti", " septingenti", " octingenti", 
" nongenti"}; 

/* All Latin adverb numbers are undeclinable */

static char *ldigit_adverbs[] = {""," semel"," bis", " ter", " quater", 
" quinquies", " sexies", " septies", " octies", " novies"};

static char *lteen_adverbs[] = {" decies", " undecies", " duodecies",
"terdecies"," quaterdecies", " quindecies", " sedecies", " septiesdecies",
" duodevicies", " undevicies"}; 

static char *ltens_adverbs[] = {" vicies", " tricies", " quadragies",
" guinquagies", " sexagies", " septuagies", " octogies", " nonagies"};

/* Except for the first three, I guessed plausible forms for these */
static char *lhundreds_adverbs[] = {" centies", " ducenties", " trecenties",
" quadringenties", " quingenties", " sescenties", 
" septingenties", " octingenties", " nongenties"};   


%}

/* This parser receives a sequence of tokens from the lexer implemented in
num_to_name.l. The lexer assigns values to some of the tokens. Yacc requires
the possible types of these values to be declared in a union as follows: */ 

%union {int i;
	char *str;
	 }

/* Here we must declare all expected tokens, with types if they have one. Two
yacc non-terminals also have types for us - triad and hundreds. These
represent two different aspects of the same thing: 1,2, or 3 consecutive
digits viewed as a natural number or as a string */  

%token COMMA
%token <str> LITERAL
%left <str> WHITESPACE
%right <i> DIGIT
%right <i> EDIGIT
%type <i> triad
%type <str> hundreds
%%

/* See main program in code area below to see how pgm gets started */

/* Grammar productions follow */

/* As for any filter, the first 2 productions parse the input as an array
of zero or more strings of ascii characters separated by newlines */

line_array: /* empty */
 	| line_array  line 
	;

line: text_array '\n'   {squeeze(buf);printf("%s\n",buf);reset();}
	| '\n' {printf("\n");}
	;


text_array:  text_array  text 
	| text_array  number  
	| text_array filler
	| text
	| number  
	| filler
	;

filler: WHITESPACE {squeeze(buf);printf("%s",buf); reset(); 
                      /* print latest number (if any) */
	 	    printf("%s",$1); /* pass through whitespace */}
	;

text:   LITERAL {printf("%s",$1); /* pass through non-numbers */}
	;

/* hundreds reduces when any number in the range 0-999 matches, etc */

number:  hundreds  
	| thousands   
	| millions   
	| billions  
	| trillions 
	| quadrillions
	| quintillions 
	| sextillions 
	| septillions 
	| octillions 
	| nonillions 
	| decillions 
	| gazillions
	;


/* English terminology since the 1800s continues past decillions up to
   centillions (1 centillion = 1 followed by 303 zeros), but we got 
   tired of this. Other options for the proverbial "really big number"
   could include zillions, jillions, ... */

gazillions: triad COMMA gazillions {
			   prepend("?",buf);
	              }
        | triad COMMA decillions {
			   prepend("?",buf);
	              } 
	;  

/* As we build up to a complete number these actions add number words
   to buf. buf is only printed (see above) when a number is completed */

decillions: triad COMMA nonillions { 
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " decillion, ", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
nonillions: triad COMMA octillions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " nonillion, ", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
octillions: triad COMMA septillions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " octillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
septillions: triad COMMA sextillions { 
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " septillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
sextillions: triad COMMA quintillions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " sextillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
quintillions: triad COMMA quadrillions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " quintillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;
quadrillions: triad COMMA trillions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " quadrillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;

trillions: triad COMMA billions {
		       if(start_state == ENGLISH){
		       if( strlen(name($1)) > 0)
			  prepend( " trillion,", buf);
			    prepend( name($1), buf); 
			}
			if(start_state == LATIN)
			   prepend("?",buf);
	}
	;

billions: triad COMMA millions {  
			if(start_state == ENGLISH){
			  if( strlen(name($1)) > 0)
				prepend( " billion,", buf);
			   prepend( name($1), buf); 
			}
			if(start_state == LATIN){
				if($1*10000 + retained_millions*10 +
				  retained_centena > 400000)prepend("?",buf);
				else {
				/* restore stuff before 100000 */
				strcpy(buf,rollback);
				prepend(" centena milia ", buf);
				prepend(laname($1*10000+
				 retained_millions*10 + retained_centena),buf);
				}
			}
			}
	;

millions: triad COMMA thousands {  
			if(start_state == ENGLISH){
			if( strlen(name($1)) > 0)
				prepend( " million,", buf);
		        prepend( name($1), buf); 
			}
			if(start_state == LATIN){

/* Centena milia was considered optional in millions and above. We have opted
to always include it for purposes of normalization */

/* Also note that in this range we need to track the number of hundred
thousands, not the number of millions. Thus we need to reset the parse  */
				 
				retained_millions = $1;
			/* restore stuff below 100000 */
				strcpy(buf,rollback);
				prepend(" centena milia ", buf);
				prepend(laname($1*10 + retained_centena),buf);
			}}
	;

thousands: triad COMMA hundreds {  
	if( strlen(name($1)) > 0){
		if(start_state == ENGLISH){
			prepend( " thousand,", buf);
        		prepend( name($1), buf); 
		}
		if(start_state == LATIN){ 
			 if($1 == 1) prepend(" mille ", buf);
			 if((1 < $1)&&($1 < 100)){
					prepend(" milia ", buf);
        				prepend( lname($1), buf); 
				}
			 strcpy(rollback,buf);
			 if($1 == 100) prepend(" centena milia ", buf);
			 if((100 < $1)&&($1 < 200)){
					prepend("  milia ", buf);
        				prepend( lname($1%100), buf); 
			 	        strcpy(rollback,buf);
					prepend(" centena ac ", buf);
				}
			 if(200 <= $1){
					prepend(" milia ", buf);
					if($1%100)
					  prepend(" milia ", rollback);
					prepend(lname($1%100),rollback);
        				prepend( lname($1), buf); 
				}

/* We need to retain the number of even hundred thousands in case the
number is a million or above because it is  needed for the switch
to multiplicative notation */

			retained_centena = $1/100;
		} /* there are no cases other languages at present */
	}}
	;

hundreds: triad { if( strlen(name($1)) > 0){
		    if(start_state == ENGLISH) 
		       sprintf( buf, "%s,", name($1)); 
		    if(start_state == LATIN){
			if($1 > 1000)
			sprintf(buf, "%s ", lname($1)+1);
			else 
			sprintf(buf,"%s ", lname($1));
			strcpy(rollback,buf);
		    }
		  }
	}
	;

/* English nomenclature, and to a lesser extent Latin, is based on 
grouping digits in blocks of 3 */

triad:  EDIGIT EDIGIT EDIGIT {$$ = 100*$1 + 10*$2 + $3;} 
      | EDIGIT EDIGIT { $$ = 10*$1 + $2; }
      |   EDIGIT {$$ = $1;}
      ;
	
%%

/* From this point onward in a .y file all text is copied literally into
the generated c code */

/* Override the trivial main program provided by yacc */

int main(int argc, char *argv[]) 
{
	char c;
	name = &ename;
	while (--argc > 0 && (*++argv)[0] == '-')
		while ( (c = *++argv[0]))
		switch (c) { /* handle any options */
		case 'v':
			printf("%s\n",VERSION);
			return 0;
		case 'h':
			printf("\nnum_to_name [-h -v -l]\n");
			printf("Filter that finds integers in each line \n");
			printf("and replaces them with words or phrases\n");
			printf("-v: print version number and exit\n");
			printf("-h: print this helpful message\n");
			printf("-l: use Latin instead of English\n\n");
			return 0;
		case 'l':
			start_state = LATIN; 
			name = &lname;
			break;
		default:
			printf("num_to_name: illegal option %c\n", c);
			return 1;
		}
			
	yyparse(); /* Pass control to the yacc parser */
	return 0;
}


char *prepend(const char *first, char *second)

/* Stick first on front of second, return second */

{

	strcpy(temp,first);  /* store first in temp */
	strcat(temp,second); /* tack on second */
	strcpy(second,temp);
	return second;
}

/* returns pointer to an English phrase for a number from 001 - 999 */

char *ename( int triad)
{
	int ones,tens,hundreds; /* digits */

	clearbuf(localbuf,strlen(localbuf));
	
	hundreds = triad/100;
	tens = (triad - hundreds*100)/10;
	ones = (triad - hundreds*100 - tens*10);

	if( tens == 0) sprintf(localbuf,"%s",digit_names[ones]);
	else 
	{    if( tens == 1)
		sprintf(localbuf,"%s",teen_names[ones]);
		else {
			sprintf(localbuf,"%s",digit_names[ones]);
			prepend(tens_names[tens-2],localbuf);
		     }  /* starts with "twenty" ^  */
	}
	if(hundreds !=0) {
		prepend(" hundred", localbuf);
		prepend(digit_names[hundreds],localbuf);
	}
	return localbuf;
}


/* returns pointer to an Latin phrase for a number from 001 - 999 */

/* Naming numbers in this range differs from English in two important ways. 
In English, naming numbers from 101 to 999 essentially reduces to naming 
numbers in the range 1-99 and adding the word "hundred". In Latin, there
are different names for each even hundred. Secondly, in the range 0-99 
subtractive numeration is used for numbers that are one less and two less
than multiples of ten. This pattern does not generalize to higher 
denominations. For example, 800 is not ducenti de mille. (Thank goodness!) */
 
char *lname( int triad)
{
	int ones,tens,hundreds; /* digits */

	clearbuf(localbuf,sizeof(localbuf));
	
	hundreds = triad/100;
	tens = (triad - hundreds*100)/10;
	ones = (triad - hundreds*100 - tens*10);

	if((ones >  0)&&((ones <= 7)||(tens==0))) 
             sprintf(localbuf,"%s",ldigit_names[ones]);
	if( tens != 0 )
	{    if( tens == 1)
		sprintf(localbuf,"%s",lteen_names[ones]);
	     else {
		if(ones == 8)
			sprintf(localbuf," duode%s",ltens_names[tens-1]+1);
		else { if(ones == 9)	
			sprintf(localbuf," unde%s",ltens_names[tens-1]+1);
		        else prepend(ltens_names[tens-2],localbuf);
		}
	     }
	}
	if(hundreds !=0) {
		prepend(lhundreds_names[hundreds-1], localbuf);
	}
	return localbuf;
}
 
/* Returns the name of an adverbial Latin number in the range 1-99999 

This is similar to lname, except that it uses a different
set of arrays giving names of digits, teens, etc. These are
used as "mulitpliers" of 100000 to give Latin names to very large numbers. 
Also note the larger range, which is needed to reach to the tens of  billions. 

*/

char *laname( int t)   /* 0 < t < 1,000,000 */ 
{
	int c,b,a; /* digits */
	int v,u;

/* We divide the 6 digit range of 6 into two ranges of 3 digits, using
 "miliens" as a send-level multiplier as the Romans seemed to do (there 
are few actual examples) t = v miliens u */

	v = t/1000; 
	u = t%1000;
	

	clearbuf(localbuf,strlen(localbuf));
	
	c = u/100; b = (u-100*c)/10; a = (u-100*c - 10*b);	

	if((a!=0)&&(b!=1)) sprintf(localbuf,"%s",ldigit_adverbs[a]);
	if( b != 0 ){
		if( b == 1)
			prepend(lteen_adverbs[a],buf);
	        else 
		{     if(a)prepend(" et ",localbuf); 
		      prepend(ltens_adverbs[b-2],localbuf);
		}
	}
	if(c != 0){
	       if((a!=0)||(b!=0))prepend(" et ",localbuf); 
	       if(c)prepend(lhundreds_adverbs[c-1], localbuf);
	}
	if(v == 0) return localbuf;

	/* else */

	prepend(" milies ",localbuf);

	c = v/100; b = (v-100*c)/10; a = (v-100*c - 10*b);	

	if((a!=0)&&(b!=1)) prepend(ldigit_adverbs[a],localbuf);
	if( b != 0 ){
		if( b == 1)
			prepend(lteen_adverbs[a],buf);
	        else 
		{     if(a)prepend(" et ",localbuf); 
		      prepend(ltens_adverbs[b-2],localbuf);
		}
	}

	if(c != 0){
	       if((a!=0)||(b!=0))prepend(" et ",localbuf); 
	       if(c)prepend(lhundreds_adverbs[c-1], localbuf);
	}
	return localbuf;
}


void clearbuf( char *first, int size )
{
	int i;
	for(i=0;i<size;i++) *(first + i) = '\0';
	return ;
}

/* squeeze: remove the last character and repeated spaces from buf.
Also remove leading space from buf.
Because of slight differences in the way we must handle english and latin
numbers we seem to inevitably end up with a few extra spaces here and
there, and commas at the end */

void squeeze(char *buf){

	int i,j=0,n,gotspace = 1; /* flag */

	n = strlen(buf);
	for(i=0;i<n;i++){
		if(!gotspace || !(buf[i] == ' '))temp[j++]=buf[i];
		if(buf[i]==' ')gotspace = 1;
		else gotspace = 0;
	}
	if(j){
		temp[j-1]='\0';
		strcpy(buf,temp);	
	}
}
