polygon intersector: added horizontal line reconstruction
[swftools.git] / lib / gocr / unicode.c
1 /*
2 This is a Optical-Character-Recognition program
3 Copyright (C) 2000-2007  Joerg Schulenburg
4
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18
19  see README for EMAIL-address
20  */
21  
22 #include "unicode.h"
23 #include <stdio.h>
24
25 /* FIXME jb global */
26 int warn=0; /* when non-zero, compose() reports base characters above 127 and undefined compositions on stderr */
27
28 /* Arguments: the character (main), and the modifier (accent, etc). See the
29       function if you want to know the modifiers. 
30    Description: This function intends to be a small helper, to avoid having
31       to write switches in functions. It's therefore mainly to accents, and
32       specially for the most usual ones. It supports the basic greek 
33       characters too, which is actually not very helpful.
34    Returns: the unicode character corresponding to the composed character.
35    
36    ToDo:
37     - It seems to me, that tables should be more effectiv.
38       So we should use tables in future? (js)
39  */
40 wchar_t compose(wchar_t main, wchar_t modifier) {
41 /* supported by now: part of ISO8859-1, basic greek characters */
42   if( main == UNKNOWN || main == PICTURE ) return main;
43 #ifdef DEBUG
44   if(modifier!=UNICODE_NULL && modifier!=SPACE)
45     printf(" compose(%c,%d)",(char)main,(int)modifier);
46 #endif
47   if(main>127 && modifier!=0 && modifier!=SPACE && warn)
48     fprintf(stderr,"# Warning compose %04x + %04x>127\n",
49      (int)modifier,(int)main);
50   switch (modifier) {
51     case UNICODE_NULL:
52     case SPACE:
53         return      (wchar_t)main;
54
55     case APOSTROPHE: /* do NOT USE this. It's here for compatibility only.
56                             Use ACUTE_ACCENT instead. */
57       fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT");
58
59     case ACUTE_ACCENT: /* acute/cedilla */
60       switch (main) {
61         case 'a':           return LATIN_SMALL_LETTER_A_WITH_ACUTE;
62         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_ACUTE;
63         case LATIN_SMALL_LETTER_AE:   return LATIN_SMALL_LETTER_AE_WITH_ACUTE;
64         case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_ACUTE;
65         case 'c':           return LATIN_SMALL_LETTER_C_WITH_ACUTE;
66         case 'C':           return LATIN_CAPITAL_LETTER_C_WITH_ACUTE;
67         case 'e':           return LATIN_SMALL_LETTER_E_WITH_ACUTE;
68         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_ACUTE;
69         case 'g':           return LATIN_SMALL_LETTER_G_WITH_ACUTE;
70         case 'G':           return LATIN_CAPITAL_LETTER_G_WITH_ACUTE;
71         case 'i':           return LATIN_SMALL_LETTER_I_WITH_ACUTE;
72         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_ACUTE;
73         case 'l':           return LATIN_SMALL_LETTER_L_WITH_ACUTE;
74         case 'L':           return LATIN_CAPITAL_LETTER_L_WITH_ACUTE;
75         case 'n':           return LATIN_SMALL_LETTER_N_WITH_ACUTE;
76         case 'N':           return LATIN_CAPITAL_LETTER_N_WITH_ACUTE;
77         case 'o':           return LATIN_SMALL_LETTER_O_WITH_ACUTE;
78         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_ACUTE;
79         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_ACUTE;
80         case 'r':           return LATIN_SMALL_LETTER_R_WITH_ACUTE;
81         case 'R':           return LATIN_CAPITAL_LETTER_R_WITH_ACUTE;
82         case 's':           return LATIN_SMALL_LETTER_S_WITH_ACUTE;
83         case 'S':           return LATIN_CAPITAL_LETTER_S_WITH_ACUTE;
84         case 'u':           return LATIN_SMALL_LETTER_U_WITH_ACUTE;
85         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_ACUTE;
86         case 'y':           return LATIN_SMALL_LETTER_Y_WITH_ACUTE;
87         case 'Y':           return LATIN_CAPITAL_LETTER_Y_WITH_ACUTE;
88         case 'z':           return LATIN_SMALL_LETTER_Z_WITH_ACUTE;
89         case 'Z':           return LATIN_CAPITAL_LETTER_Z_WITH_ACUTE;
90         default:
91           if(warn)fprintf( stderr, " COMPOSE: ACUTE_ACCENT+%04x not defined\n",(int)main);
92       }
93       break;
94
95     case BREVE: /* caron (latin2)  "u"-above-... (small bow) */
96       switch (main) {
97         /* FIXME write separate heuristics for breve */
98         case 'a':           return LATIN_SMALL_LETTER_A_WITH_BREVE;
99         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_BREVE;
100         case 'e':           return LATIN_SMALL_LETTER_E_WITH_BREVE;
101         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_BREVE;
102         case 'g':           return LATIN_SMALL_LETTER_G_WITH_BREVE;
103         case 'G':           return LATIN_CAPITAL_LETTER_G_WITH_BREVE;
104         case 'i':           return LATIN_SMALL_LETTER_I_WITH_BREVE;
105         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_BREVE;
106         case 'o':           return LATIN_SMALL_LETTER_O_WITH_BREVE;
107         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_BREVE;
108         case 'u':           return LATIN_SMALL_LETTER_U_WITH_BREVE;
109         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_BREVE;
110         default:
111           if(warn)fprintf( stderr, " COMPOSE: BREVE+%04x not defined\n",(int)main);
112       }
113       break;
114
115     case CARON: /* caron (latin2)  "v"-above-... */
116       switch (main) {
117         case 'a':           return LATIN_SMALL_LETTER_A_WITH_CARON;
118         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_CARON;
119         case 'c':           return LATIN_SMALL_LETTER_C_WITH_CARON;
120         case 'C':           return LATIN_CAPITAL_LETTER_C_WITH_CARON;
121         case 'e':           return LATIN_SMALL_LETTER_E_WITH_CARON;
122         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_CARON;
123         case 'i':           return LATIN_SMALL_LETTER_I_WITH_CARON;
124         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_CARON;
125         case 'o':           return LATIN_SMALL_LETTER_O_WITH_CARON;
126         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_CARON;
127         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_CARON;
128         case 's':           return LATIN_SMALL_LETTER_S_WITH_CARON;
129         case 'S':           return LATIN_CAPITAL_LETTER_S_WITH_CARON;
130         case 'u':           return LATIN_SMALL_LETTER_U_WITH_CARON;
131         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_CARON;
132         case 'z':           return LATIN_SMALL_LETTER_Z_WITH_CARON;
133         case 'Z':           return LATIN_CAPITAL_LETTER_Z_WITH_CARON;
134         default:
135           if(warn)fprintf( stderr, " COMPOSE: CARON+%04x not defined\n",(int)main);
136       }
137       break;
138
139     case CEDILLA:
140       switch (main) {
141         case 'c':           return LATIN_SMALL_LETTER_C_WITH_CEDILLA;
142         case 'C':           return LATIN_CAPITAL_LETTER_C_WITH_CEDILLA;
143         default:
144           if(warn)fprintf( stderr, " COMPOSE: CEDILLA+%04x not defined\n",(int)main);
145       }
146       break;
147
148     case TILDE:
149       switch (main) {
150         case 'a':           return LATIN_SMALL_LETTER_A_WITH_TILDE;
151         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_TILDE;
152         case 'i':           return LATIN_SMALL_LETTER_I_WITH_TILDE;
153         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_TILDE;
154         case 'n':           return LATIN_SMALL_LETTER_N_WITH_TILDE;
155         case 'N':           return LATIN_CAPITAL_LETTER_N_WITH_TILDE;
156         case 'o':           return LATIN_SMALL_LETTER_O_WITH_TILDE;
157         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_TILDE;
158         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_TILDE;
159         case 'u':           return LATIN_SMALL_LETTER_U_WITH_TILDE;
160         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_TILDE;
161         default:
162           if(warn)fprintf( stderr, " COMPOSE: TILDE+%04x not defined\n",(int)main);
163       }
164       break;
165  
166     case GRAVE_ACCENT:
167       switch (main) {
168         case 'a':           return LATIN_SMALL_LETTER_A_WITH_GRAVE;
169         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_GRAVE;
170         case 'e':           return LATIN_SMALL_LETTER_E_WITH_GRAVE;
171         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_GRAVE;
172         case 'i':           return LATIN_SMALL_LETTER_I_WITH_GRAVE;
173         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_GRAVE;
174         case 'n':           return LATIN_SMALL_LETTER_N_WITH_GRAVE;
175         case 'N':           return LATIN_CAPITAL_LETTER_N_WITH_GRAVE;
176         case 'o':           return LATIN_SMALL_LETTER_O_WITH_GRAVE;
177         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_GRAVE;
178         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_GRAVE;
179         case 'u':           return LATIN_SMALL_LETTER_U_WITH_GRAVE;
180         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_GRAVE;
181         default:
182           if(warn)fprintf( stderr, " COMPOSE: GRAVE_ACCENT+%04x not defined\n",(int)main);
183       }
184       break;
185  
186     case QUOTATION_MARK: /* do NOT USE this. It's here for compatibility only. 
187                             Use DIAERESIS instead. */
188       fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT");
189  
190     case DIAERESIS:
191       switch (main) {
192         case 'a':           return LATIN_SMALL_LETTER_A_WITH_DIAERESIS;
193         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS;
194         case 'e':           return LATIN_SMALL_LETTER_E_WITH_DIAERESIS;
195         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS;
196         case 'i':           return LATIN_SMALL_LETTER_I_WITH_DIAERESIS;
197         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS;
198         case 'o':           return LATIN_SMALL_LETTER_O_WITH_DIAERESIS;
199         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS;
200         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS;
201         case 'u':           return LATIN_SMALL_LETTER_U_WITH_DIAERESIS;
202         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS;
203         case 'y':           return LATIN_SMALL_LETTER_Y_WITH_DIAERESIS;
204         case 'Y':           return LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
205         default:
206           if(warn)fprintf( stderr, " COMPOSE: DIAERESIS+%04x (%c) not defined\n",(int)main,(char)main);
207       }
208       break;
209
210     case CIRCUMFLEX_ACCENT: /* ^ */
211       switch (main) {
212         case 'a':           return LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX;
213         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX;
214         case 'c':           return LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX;
215         case 'C':           return LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX;
216         case 'e':           return LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX;
217         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX;
218         case 'g':           return LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX;
219         case 'G':           return LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX;
220         case 'h':           return LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX;
221         case 'H':           return LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX;
222         case 'i':           return LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX;
223         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX;
224         case 'j':           return LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX;
225         case 'J':           return LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX;
226         case 'o':           return LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX;
227         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX;
228         case '0':           return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX;
229         case 's':           return LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX;
230         case 'S':           return LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX;
231         case 'u':           return LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX;
232         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX;
233         case 'w':           return LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX;
234         case 'W':           return LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX;
235         case 'y':           return LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX;
236         case 'Y':           return LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX;
237         default:
238           if(warn)fprintf( stderr, " COMPOSE: CIRCUMFLEX_ACCENT+%04x not defined\n",(int)main);
239       }
240       break;
241
242     case MACRON: /* a minus sign above the char (latin2) */
243       switch (main) {
244         case 'a':           return LATIN_SMALL_LETTER_A_WITH_MACRON;
245         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_MACRON;
246         case 'e':           return LATIN_SMALL_LETTER_E_WITH_MACRON;
247         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_MACRON;
248         case 'i':           return LATIN_SMALL_LETTER_I_WITH_MACRON;
249         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_MACRON;
250         case 'o':           return LATIN_SMALL_LETTER_O_WITH_MACRON;
251         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_MACRON;
252         case 'u':           return LATIN_SMALL_LETTER_U_WITH_MACRON;
253         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_MACRON;
254         case 'y':           return LATIN_SMALL_LETTER_Y_WITH_MACRON;
255         case 'Y':           return LATIN_CAPITAL_LETTER_Y_WITH_MACRON;
256         case LATIN_SMALL_LETTER_AE:   return LATIN_SMALL_LETTER_AE_WITH_MACRON;
257         case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_MACRON;
258         case '=':           return IDENTICAL_TO;
259         case '-':           return '=';
260         case ' ':           return MODIFIER_LETTER_MACRON;
261         default:
262           if(warn)fprintf( stderr, " COMPOSE: MACRON+%04x not defined\n",(int)main);
263       }
264       break;
265
266     case DOT_ABOVE: /* latin2 */
267       switch (main) {
268         case 'a':           return LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE;
269         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE;
270         case 'c':           return LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE;
271         case 'C':           return LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE;
272         case 'e':           return LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE;
273         case 'E':           return LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE;
274         case 'g':           return LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE;
275         case 'G':           return LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE;
276         case 'l':           return 'i';  /* correct wrong recognition */
277         case 'i':           return 'i';
278         case LATIN_SMALL_LETTER_DOTLESS_I:          return 'i';
279         case 'I':           return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
280         case 'j':           return 'j';
281         case 'o':           return LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE;
282         case 'O':           return LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE;
283         case 'z':           return LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE;
284         case 'Z':           return LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE;
285         case ',':           return ';';
286         case '.':           return ':';
287         default:
288           if(warn)fprintf( stderr, " COMPOSE: DOT_ABOVE+%04x not defined\n",(int)main);
289       }
290       break;
291
292     case RING_ABOVE:
293       switch (main) {
294         case 'a':           return LATIN_SMALL_LETTER_A_WITH_RING_ABOVE;
295         case 'A':           return LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE;
296         case 'u':           return LATIN_SMALL_LETTER_U_WITH_RING_ABOVE;
297         case 'U':           return LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE;
298         default:
299           if(warn)fprintf( stderr, " COMPOSE: RING_ABOVE+%04x not defined\n",(int)main);
300       }
301       break;
302
303     case 'e': /* e ligatures: ae, oe. */
304     case 'E':
305       switch (main) {
306         case 'a':           return LATIN_SMALL_LETTER_AE;
307         case 'A':           return LATIN_CAPITAL_LETTER_AE;
308         case 'o':           return LATIN_SMALL_LIGATURE_OE;
309         case 'O':           return LATIN_CAPITAL_LIGATURE_OE;
310         case '0':           return LATIN_CAPITAL_LIGATURE_OE;
311         default:
312           if(warn)fprintf( stderr, " COMPOSE: %04x+e/E not defined\n",(int)main);
313       }
314       break;
315
316     case 'g': /* greek */
317       switch (main) {
318         /* missing 0x37A-0x390 */
319         /* weird cases: Q -> theta (it resembles a little, doesn't it?)
320                         V -> psi   (what can I do?) */
321         case 'A':   return GREEK_CAPITAL_LETTER_ALPHA;
322         case 'B':   return GREEK_CAPITAL_LETTER_BETA;
323         case 'G':   return GREEK_CAPITAL_LETTER_GAMMA;
324         case 'D':   return GREEK_CAPITAL_LETTER_DELTA;
325         case 'E':   return GREEK_CAPITAL_LETTER_EPSILON;
326         case 'Z':   return GREEK_CAPITAL_LETTER_ZETA;
327         case 'H':   return GREEK_CAPITAL_LETTER_ETA;
328         case 'Q':   return GREEK_CAPITAL_LETTER_THETA;
329         case 'I':   return GREEK_CAPITAL_LETTER_IOTA;
330         case 'K':   return GREEK_CAPITAL_LETTER_KAPPA;
331         case 'L':   return GREEK_CAPITAL_LETTER_LAMDA;
332         case 'M':   return GREEK_CAPITAL_LETTER_MU;
333         case 'N':   return GREEK_CAPITAL_LETTER_NU;
334         case 'X':   return GREEK_CAPITAL_LETTER_XI;
335         case 'O':   return GREEK_CAPITAL_LETTER_OMICRON;
336         case 'P':   return GREEK_CAPITAL_LETTER_PI;
337         case 'R':   return GREEK_CAPITAL_LETTER_RHO;
338         case 'S':   return GREEK_CAPITAL_LETTER_SIGMA;
339         case 'T':   return GREEK_CAPITAL_LETTER_TAU;
340         case 'Y':   return GREEK_CAPITAL_LETTER_UPSILON;
341         case 'F':   return GREEK_CAPITAL_LETTER_PHI;
342         case 'C':   return GREEK_CAPITAL_LETTER_CHI;
343         case 'V':   return GREEK_CAPITAL_LETTER_PSI;
344         case 'W':   return GREEK_CAPITAL_LETTER_OMEGA;
345 /*
346         case '':   return GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA;
347         case '':   return GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA;
348         case '':   return GREEK_SMALL_LETTER_ALPHA_WITH_TONOS;
349         case '':   return GREEK_SMALL_LETTER_EPSILON_WITH_TONOS;
350         case '':   return GREEK_SMALL_LETTER_ETA_WITH_TONOS;
351         case '':   return GREEK_SMALL_LETTER_IOTA_WITH_TONOS;
352         case '':   return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
353 */
354         case 'a':   return GREEK_SMALL_LETTER_ALPHA;
355         case 'b':   return GREEK_SMALL_LETTER_BETA;
356         case 'g':   return GREEK_SMALL_LETTER_GAMMA;
357         case 'd':   return GREEK_SMALL_LETTER_DELTA;
358         case 'e':   return GREEK_SMALL_LETTER_EPSILON;
359         case 'z':   return GREEK_SMALL_LETTER_ZETA;
360         case 'h':   return GREEK_SMALL_LETTER_ETA;
361         case 'q':   return GREEK_SMALL_LETTER_THETA;
362         case 'i':   return GREEK_SMALL_LETTER_IOTA;
363         case 'k':   return GREEK_SMALL_LETTER_KAPPA;
364         case 'l':   return GREEK_SMALL_LETTER_LAMDA;
365         case 'm':   return GREEK_SMALL_LETTER_MU;
366         case 'n':   return GREEK_SMALL_LETTER_NU;
367         case 'x':   return GREEK_SMALL_LETTER_XI;
368         case 'o':   return GREEK_SMALL_LETTER_OMICRON;
369         case 'p':   return GREEK_SMALL_LETTER_PI;
370         case 'r':   return GREEK_SMALL_LETTER_RHO;
371         case '&':   return GREEK_SMALL_LETTER_FINAL_SIGMA;
372         case 's':   return GREEK_SMALL_LETTER_SIGMA;
373         case 't':   return GREEK_SMALL_LETTER_TAU;
374         case 'y':   return GREEK_SMALL_LETTER_UPSILON;
375         case 'f':   return GREEK_SMALL_LETTER_PHI;
376         case 'c':   return GREEK_SMALL_LETTER_CHI;
377         case 'v':   return GREEK_SMALL_LETTER_PSI;
378         case 'w':   return GREEK_SMALL_LETTER_OMEGA;
379 /*
380         case '':   return GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA;
381         case '':   return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA;
382         case '':   return GREEK_SMALL_LETTER_OMICRON_WITH_TONOS;
383         case '':   return GREEK_SMALL_LETTER_UPSILON_WITH_TONOS;
384         case '':   return GREEK_SMALL_LETTER_OMEGA_WITH_TONOS;
385         case '':   return GREEK_BETA_SYMBOL;
386         case '':   return GREEK_THETA_SYMBOL;
387         case '':   return GREEK_UPSILON_WITH_HOOK_SYMBOL;
388         case '':   return GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL;
389         case '':   return GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL;
390         case '':   return GREEK_PHI_SYMBOL;
391         case '':   return GREEK_PI_SYMBOL;
392 */
393         default:
394           if(warn)fprintf( stderr, " COMPOSE: GREEK %04x not defined\n",(int)main);
395       }
396       break;   
397
398     default:
399       fprintf( stderr, " COMPOSE: modifier %04x not defined\n",(int)modifier);
400   }
401   return (wchar_t)main;
402 }
403
404 #define UNDEFINED                       "~" /* string decode() returns for characters it has no replacement text for */
405
406 /* Arguments: character in Unicode format, type of format to convert to.
407    Returns: a string containing the Unicode character converted to the chosen
408     format. This string is statically allocated and should not be freed.
409    ToDo: better using tables?
410  */
411 const char *decode(wchar_t c, FORMAT type) {
412   /* static char d;  --- js: big bug (missing \0) if &d returned */
413   /*FIXME jb static*/ static char bbuf[8*32]; /* space for 8 buffers, rotating */
414   /*FIXME jb static*/ static char *buf=bbuf;  /* used for UTF8 sequences and undefined codes */
415   buf+=32; if(buf>=bbuf+8*32) buf=bbuf;
416   buf[0]=buf[1]=buf[2]=0;
417   switch (type) {
418     case ISO8859_1:
419       if ( c <= 0xFF ) { /* UNICODE == ISO8859-1 */
420         buf[0] = (char)c;
421         return buf;
422       }
423       switch (c) { /* not found in list, but perhaps we can describe it */
424         /* todo: add greek. GREEK_SMALL_LETTER_ALPHA = alpha */
425         
426         /* general puctuation */
427         case HYPHEN:
428           return (const char *)"-";
429         case FIGURE_DASH:
430         case EN_DASH:
431           return (const char *)"--";
432         case EM_DASH:
433           return (const char *)"---";
434         case LEFT_SINGLE_QUOTATION_MARK:
435           return (const char *)"`";
436         case RIGHT_SINGLE_QUOTATION_MARK:
437           return (const char *)"'";
438         case SINGLE_LOW_9_QUOTATION_MARK:
439           return (const char *)",";
440         case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK:
441           return (const char *)UNDEFINED;
442         case LEFT_DOUBLE_QUOTATION_MARK:
443           return (const char *)"``";
444         case RIGHT_DOUBLE_QUOTATION_MARK:
445           return (const char *)"''";
446         case DOUBLE_LOW_9_QUOTATION_MARK:
447           return (const char *)",,";
448         case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK:
449           return (const char *)UNDEFINED;
450         case DAGGER:
451           return (const char *)"+";
452         case DOUBLE_DAGGER:
453           return (const char *)"*";
454         case BULLET:
455           return (const char *)"*";
456         case TRIANGULAR_BULLET:
457           return (const char *)"*";
458         case HYPHENATION_POINT:
459           return (const char *)"-";
460         case HORIZONTAL_ELLIPSIS:
461           return (const char *)"...";
462         case PER_MILLE_SIGN:
463           return (const char *)"%%"; /* awk! */
464         case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK:
465           return (const char *)"<";
466         case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK:
467           return (const char *)">";
468         case EURO_CURRENCY_SIGN:
469           return (const char *)"EUR"; /* change it! */
470         
471         /* ligatures */
472         case LATIN_SMALL_LIGATURE_FF:
473           return (const char *)"ff";
474         case LATIN_SMALL_LIGATURE_FI:
475           return (const char *)"fi";
476         case LATIN_SMALL_LIGATURE_FL:
477           return (const char *)"fl";
478         case LATIN_SMALL_LIGATURE_FFI:
479           return (const char *)"ffi";
480         case LATIN_SMALL_LIGATURE_FFL:
481           return (const char *)"ffl";
482         case LATIN_SMALL_LIGATURE_LONG_S_T:
483         case LATIN_SMALL_LIGATURE_ST:
484           return (const char *)"st";
485         
486         /* extra */
487         case UNKNOWN:
488           return (const char *)"_";
489         case PICTURE:
490           return (const char *)"_"; /* Due to Mobile OCR */
491                 
492         default:
493           /* snprintf seems to be no standard, so I use insecure sprintf */
494           sprintf(buf,"\\code(%04x)",(unsigned)c);
495           return buf;  /* UNDEFINED; */
496       }
497       break;
498     case TeX:
499       if ( c >= SPACE && c <= TILDE ) { /* ASCII */
500         switch (c) {
501           case '$':
502               return (const char *)"\\$";
503           case '&':
504               return (const char *)"\\&";
505           case '%':
506               return (const char *)"\\%";
507           case '#':
508               return (const char *)"\\#";
509           case '_':
510               return (const char *)"\\_";
511           case '{':
512               return (const char *)"\\{";
513           case '}':
514               return (const char *)"\\}";
515           case '\\':
516               return (const char *)"$\\backslash$";
517           case '~':
518               return (const char *)"\\~{}";
519           case '^':
520               return (const char *)"\\^{}";
521           default:
522               buf[0] = (char)c;
523               return (const char *)buf;
524         }
525       }
526       switch (c) {
527         /* ISO8859_1 */
528         case NO_BREAK_SPACE:
529           return (const char *)"~";
530         case INVERTED_EXCLAMATION_MARK:
531           return (const char *)"!'";
532         case CENT_SIGN:
533           return (const char *)"\\textcent"; /* \usepackage{textcomp} */
534         case POUND_SIGN:
535           return (const char *)"\\pounds";
536         case EURO_CURRENCY_SIGN:
537           return (const char *)"\\euro"; /* \usepackage{eurosans} */
538         case CURRENCY_SIGN:
539           return (const char *)"\\textcurrency"; /* \usepackage{textcomp} */
540         case YEN_SIGN:
541           return (const char *)"\\textyen"; /* \usepackage{textcomp} */
542         case BROKEN_BAR:
543           return (const char *)"\\textbrokenbar"; /* \usepackage{textcomp} */
544         case SECTION_SIGN:
545           return (const char *)"\\S";
546         case DIAERESIS:
547           return (const char *)"\"";
548         case COPYRIGHT_SIGN:
549           return (const char *)"\\copyright";
550         case FEMININE_ORDINAL_INDICATOR:
551           return (const char *)"$^{\\underbar{a}}$";
552         case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
553           return (const char *)"\\flqq{}";
554         case NOT_SIGN:
555           return (const char *)"$\\lnot$";
556         case SOFT_HYPHEN:
557           return (const char *)"\\-";
558         case REGISTERED_SIGN:
559           return (const char *)"\\textregistered";/* \usepackage{textcomp} */
560         case MACRON:
561           return (const char *)"\\textasciimacron";/* \usepackage{textcomp} */
562         case DEGREE_SIGN:
563           return (const char *)"$^{o}$";
564         case PLUS_MINUS_SIGN:
565           return (const char *)"$\\pm$";
566         case SUPERSCRIPT_TWO:
567           return (const char *)"$^{2}$";
568         case SUPERSCRIPT_THREE:
569           return (const char *)"$^{3}$";
570         case ACUTE_ACCENT:
571           return (const char *)"\\( \\prime \\)";
572         case MICRO_SIGN:
573           return (const char *)"$\\mu$";
574         case PILCROW_SIGN:
575           return (const char *)"\\P";
576         case MIDDLE_DOT:
577           return (const char *)"$\\cdot$";
578         case CEDILLA:
579           return (const char *)"\\,";
580         case SUPERSCRIPT_ONE:
581           return (const char *)"$^{1}$";
582         case MASCULINE_ORDINAL_INDICATOR:
583           return (const char *)"$^{\\underbar{o}}$";
584         case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
585           return (const char *)"\\frqq{}";
586         case VULGAR_FRACTION_ONE_QUARTER:        /* these fractions are not good*/
587           return (const char *)"\\( 1\\over 4 \\)";
588         case VULGAR_FRACTION_ONE_HALF:
589           return (const char *)"\\( 1\\over 2 \\)";
590         case VULGAR_FRACTION_THREE_QUARTERS:
591           return (const char *)"\\( 3\\over 4 \\)";
592         case INVERTED_QUESTION_MARK:
593           return (const char *)"?'";
594         case LATIN_CAPITAL_LETTER_A_WITH_GRAVE:
595           return (const char *)"\\`A";
596         case LATIN_CAPITAL_LETTER_A_WITH_ACUTE:
597           return (const char *)"\\'A";
598         case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX:
599           return (const char *)"\\^A";
600         case LATIN_CAPITAL_LETTER_A_WITH_TILDE:
601           return (const char *)"\\~A";
602         case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS:
603           return (const char *)"\\\"A";
604         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
605           return (const char *)"\\AA";
606         case LATIN_CAPITAL_LETTER_AE:
607           return (const char *)"\\AE";
608         case LATIN_CAPITAL_LETTER_C_WITH_CARON:
609           return (const char *)"\\v{C}";
610         case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA:
611           return (const char *)"\\C";
612         case LATIN_CAPITAL_LETTER_E_WITH_GRAVE:
613           return (const char *)"\\`E";
614         case LATIN_CAPITAL_LETTER_E_WITH_ACUTE:
615           return (const char *)"\\'E";
616         case LATIN_CAPITAL_LETTER_E_WITH_CARON:
617           return (const char *)"\\v{E}";
618         case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX:
619           return (const char *)"\\^E";
620         case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS:
621           return (const char *)"\\\"E";
622         case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
623           return (const char *)"\\`I";
624         case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
625           return (const char *)"\\'I";
626         case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX:
627           return (const char *)"\\^I";
628         case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS:
629           return (const char *)"\\\"I";
630         case LATIN_CAPITAL_LETTER_ETH:
631           return (const char *)UNDEFINED;
632         case LATIN_CAPITAL_LETTER_N_WITH_TILDE:
633           return (const char *)"\\~N";
634         case LATIN_CAPITAL_LETTER_O_WITH_GRAVE:
635           return (const char *)"\\`O";
636         case LATIN_CAPITAL_LETTER_O_WITH_ACUTE:
637           return (const char *)"\\'O";
638         case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX:
639           return (const char *)"\\^O";
640         case LATIN_CAPITAL_LETTER_O_WITH_TILDE:
641           return (const char *)"\\~O";
642         case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS:
643           return (const char *)"\\\"O";
644         case MULTIPLICATION_SIGN:
645           return (const char *)"$\\times$";
646         case LATIN_CAPITAL_LETTER_O_WITH_STROKE:
647           return (const char *)"\\O";
648         case LATIN_CAPITAL_LETTER_S_WITH_CARON:
649           return (const char *)"\\v{S}";
650         case LATIN_CAPITAL_LETTER_U_WITH_GRAVE:
651           return (const char *)"\\`U";
652         case LATIN_CAPITAL_LETTER_U_WITH_ACUTE:
653           return (const char *)"\\'U";
654         case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX:
655           return (const char *)"\\^U";
656         case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS:
657           return (const char *)"\\\"U";
658         case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE:
659           return (const char *)"\\'Y";
660         case LATIN_CAPITAL_LETTER_Z_WITH_CARON:
661           return (const char *)"\\v{Z}";
662         case LATIN_CAPITAL_LETTER_THORN:
663           return (const char *)UNDEFINED;
664         case LATIN_SMALL_LETTER_SHARP_S:
665           return (const char *)"\\ss";
666         case LATIN_SMALL_LETTER_A_WITH_GRAVE:
667           return (const char *)"\\`a";
668         case LATIN_SMALL_LETTER_A_WITH_ACUTE:
669           return (const char *)"\\'a";
670         case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX:
671           return (const char *)"\\^a";
672         case LATIN_SMALL_LETTER_A_WITH_TILDE:
673           return (const char *)"\\~a";
674         case LATIN_SMALL_LETTER_A_WITH_DIAERESIS:
675           return (const char *)"\\\"a";
676         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
677           return (const char *)"\\aa";
678         case LATIN_SMALL_LETTER_AE:
679           return (const char *)"\\ae";
680         case LATIN_SMALL_LETTER_C_WITH_CARON:
681           return (const char *)"\\v{c}";
682         case LATIN_SMALL_LETTER_C_WITH_CEDILLA:
683           return (const char *)"\\c";
684         case LATIN_SMALL_LETTER_E_WITH_GRAVE:
685           return (const char *)"\\`e";
686         case LATIN_SMALL_LETTER_E_WITH_ACUTE:
687           return (const char *)"\\'e";
688         case LATIN_SMALL_LETTER_E_WITH_CARON:
689           return (const char *)"\\v{e}";
690         case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX:
691           return (const char *)"\\^e";
692         case LATIN_SMALL_LETTER_E_WITH_DIAERESIS:
693           return (const char *)"\\\"e";
694         case LATIN_SMALL_LETTER_I_WITH_GRAVE:
695           return (const char *)"\\`i";
696         case LATIN_SMALL_LETTER_I_WITH_ACUTE:
697           return (const char *)"\\'i";
698         case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX:
699           return (const char *)"\\^i";
700         case LATIN_SMALL_LETTER_I_WITH_DIAERESIS:
701           return (const char *)"\\\"i";
702         case LATIN_SMALL_LETTER_ETH:
703           return (const char *)UNDEFINED;
704         case LATIN_SMALL_LETTER_N_WITH_TILDE:
705           return (const char *)"\\~n";
706         case LATIN_SMALL_LETTER_O_WITH_GRAVE:
707           return (const char *)"\\`o";
708         case LATIN_SMALL_LETTER_O_WITH_ACUTE:
709           return (const char *)"\\'o";
710         case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX:
711           return (const char *)"\\^o";
712         case LATIN_SMALL_LETTER_O_WITH_TILDE:
713           return (const char *)"\\~o";
714         case LATIN_SMALL_LETTER_O_WITH_DIAERESIS:
715           return (const char *)"\\\"o";
716         case DIVISION_SIGN:
717           return (const char *)"$\\div$";
718         case LATIN_SMALL_LETTER_O_WITH_STROKE:
719           return (const char *)"\\o";
720         case LATIN_SMALL_LETTER_S_WITH_CARON:
721           return (const char *)"\\v{s}";
722         case LATIN_SMALL_LETTER_U_WITH_GRAVE:
723           return (const char *)"\\`u";
724         case LATIN_SMALL_LETTER_U_WITH_ACUTE:
725           return (const char *)"\\'u";
726         case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX:
727           return (const char *)"\\^u";
728         case LATIN_SMALL_LETTER_U_WITH_DIAERESIS:
729           return (const char *)"\\\"u";
730         case LATIN_SMALL_LETTER_Y_WITH_ACUTE:
731           return (const char *)"\\'y";
732         case LATIN_SMALL_LETTER_THORN:
733           return (const char *)UNDEFINED;
734         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
735           return (const char *)"\\\"y";
736         case LATIN_SMALL_LETTER_Z_WITH_CARON:
737           return (const char *)"\\v{z}";
738
739         /* greek */
740           /* some (punctuation, accents, accented capital) greek letters missing*/
741         case GREEK_CAPITAL_LETTER_ALPHA:
742           return (const char *)"A";
743         case GREEK_CAPITAL_LETTER_BETA:
744           return (const char *)"B";
745         case GREEK_CAPITAL_LETTER_GAMMA:
746           return (const char *)"\\( \\Gamma \\)";
747         case GREEK_CAPITAL_LETTER_DELTA:
748           return (const char *)"\\( \\Delta \\)";
749         case GREEK_CAPITAL_LETTER_EPSILON:
750           return (const char *)"E";
751         case GREEK_CAPITAL_LETTER_ZETA:
752           return (const char *)"Z";
753         case GREEK_CAPITAL_LETTER_ETA:
754           return (const char *)"H";
755         case GREEK_CAPITAL_LETTER_THETA:
756           return (const char *)"\\( \\Theta \\)";
757         case GREEK_CAPITAL_LETTER_IOTA:
758           return (const char *)"I";
759         case GREEK_CAPITAL_LETTER_KAPPA:
760           return (const char *)"K";
761         case GREEK_CAPITAL_LETTER_LAMDA:
762           return (const char *)"\\( \\Lambda \\)";
763         case GREEK_CAPITAL_LETTER_MU:
764           return (const char *)"M";
765         case GREEK_CAPITAL_LETTER_NU:
766           return (const char *)"N";
767         case GREEK_CAPITAL_LETTER_XI:
768           return (const char *)"\\( \\Xi \\)";
769         case GREEK_CAPITAL_LETTER_OMICRON:
770           return (const char *)"O";
771         case GREEK_CAPITAL_LETTER_PI:
772           return (const char *)"\\( \\Pi \\)";
773         case GREEK_CAPITAL_LETTER_RHO:
774           return (const char *)"P";
775         case GREEK_CAPITAL_LETTER_SIGMA:
776           return (const char *)"\\( \\Sigma \\)";
777         case GREEK_CAPITAL_LETTER_TAU:
778           return (const char *)"T";
779         case GREEK_CAPITAL_LETTER_UPSILON:
780           return (const char *)"\\( \\Upsilon \\)";
781         case GREEK_CAPITAL_LETTER_PHI:
782           return (const char *)"\\( \\Phi \\)";
783         case GREEK_CAPITAL_LETTER_CHI:
784           return (const char *)"\\( \\Chi \\)";
785         case GREEK_CAPITAL_LETTER_PSI:
786           return (const char *)"\\( \\Psi \\)";
787         case GREEK_CAPITAL_LETTER_OMEGA:
788           return (const char *)"\\( \\Omega \\)";
789         case GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA:
790           return (const char *)UNDEFINED;
791         case GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA:
792           return (const char *)UNDEFINED;
793         case GREEK_SMALL_LETTER_ALPHA_WITH_TONOS:
794           return (const char *)UNDEFINED;
795         case GREEK_SMALL_LETTER_EPSILON_WITH_TONOS:
796           return (const char *)UNDEFINED;
797         case GREEK_SMALL_LETTER_ETA_WITH_TONOS:
798           return (const char *)UNDEFINED;
799         case GREEK_SMALL_LETTER_IOTA_WITH_TONOS:
800           return (const char *)UNDEFINED;
801         case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
802           return (const char *)UNDEFINED;
803         case GREEK_SMALL_LETTER_ALPHA:
804           return (const char *)"\\( \\alpha \\)";
805         case GREEK_SMALL_LETTER_BETA:
806           return (const char *)"\\( \\beta \\)";
807         case GREEK_SMALL_LETTER_GAMMA:
808           return (const char *)"\\( \\gamma \\)";
809         case GREEK_SMALL_LETTER_DELTA:
810           return (const char *)"\\( \\delta \\)";
811         case GREEK_SMALL_LETTER_EPSILON:
812           return (const char *)"\\( \\epsilon \\)";
813         case GREEK_SMALL_LETTER_ZETA:
814           return (const char *)"\\( \\zeta \\)";
815         case GREEK_SMALL_LETTER_ETA:
816           return (const char *)"\\( \\eta \\)";
817         case GREEK_SMALL_LETTER_THETA:
818           return (const char *)"\\( \\theta \\)";
819         case GREEK_SMALL_LETTER_IOTA:
820           return (const char *)"\\( \\iota \\)";
821         case GREEK_SMALL_LETTER_KAPPA:
822           return (const char *)"\\( \\kappa \\)";
823         case GREEK_SMALL_LETTER_LAMDA:
824           return (const char *)"\\( \\lambda \\)";
825         case GREEK_SMALL_LETTER_MU:
826           return (const char *)"\\( \\mu \\)";
827         case GREEK_SMALL_LETTER_NU:
828           return (const char *)"\\( \\nu \\)";
829         case GREEK_SMALL_LETTER_XI:
830           return (const char *)"\\( \\xi \\)";
831         case GREEK_SMALL_LETTER_OMICRON:
832           return (const char *)"\\( \\omicron \\)";
833         case GREEK_SMALL_LETTER_PI:
834           return (const char *)"\\( \\pi \\)";
835         case GREEK_SMALL_LETTER_RHO:
836           return (const char *)"\\( \\rho \\)";
837         case GREEK_SMALL_LETTER_FINAL_SIGMA:
838           return (const char *)"\\( \\varsigma \\)";
839         case GREEK_SMALL_LETTER_SIGMA:
840           return (const char *)"\\( \\sigma \\)";
841         case GREEK_SMALL_LETTER_TAU:
842           return (const char *)"\\( \\tau \\)";
843         case GREEK_SMALL_LETTER_UPSILON:
844           return (const char *)"\\( \\upsilon \\)";
845         case GREEK_SMALL_LETTER_PHI:
846           return (const char *)"\\( \\varphi \\)";
847         case GREEK_SMALL_LETTER_CHI:
848           return (const char *)"\\( \\chi \\)";
849         case GREEK_SMALL_LETTER_PSI:
850           return (const char *)"\\( \\psi \\)";
851         case GREEK_SMALL_LETTER_OMEGA:
852           return (const char *)"\\( \\omega \\)";
853         case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA:
854           return (const char *)UNDEFINED;
855         case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA:
856           return (const char *)UNDEFINED;
857         case GREEK_SMALL_LETTER_OMICRON_WITH_TONOS:
858           return (const char *)UNDEFINED;
859         case GREEK_SMALL_LETTER_UPSILON_WITH_TONOS:
860           return (const char *)UNDEFINED;
861         case GREEK_SMALL_LETTER_OMEGA_WITH_TONOS:
862           return (const char *)UNDEFINED;
863         case GREEK_BETA_SYMBOL:
864           return (const char *)UNDEFINED;
865         case GREEK_THETA_SYMBOL:
866           return (const char *)"\\( \\vartheta \\)";
867         case GREEK_UPSILON_WITH_HOOK_SYMBOL:
868           return (const char *)UNDEFINED;
869         case GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL:
870           return (const char *)UNDEFINED;
871         case GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL:
872           return (const char *)UNDEFINED;
873         case GREEK_PHI_SYMBOL:
874           return (const char *)"\\( \\phi \\)";
875         case GREEK_PI_SYMBOL:
876           return (const char *)"\\( \\varpi \\)";
877           /* and some greek letters missing*/
878
879         /* punctuation (partial) */
880         case HYPHEN:
881           return (const char *)"-";
882         case NON_BREAKING_HYPHEN:
883           return (const char *)UNDEFINED;
884         case FIGURE_DASH:
885         case EN_DASH:
886           return (const char *)"--";
887         case EM_DASH:
888           return (const char *)"---";
889         case HORIZONTAL_BAR:
890           return (const char *)UNDEFINED;
891         case LEFT_SINGLE_QUOTATION_MARK:
892           return (const char *)"`";
893         case RIGHT_SINGLE_QUOTATION_MARK:
894           return (const char *)"'";
895         case SINGLE_LOW_9_QUOTATION_MARK:
896           return (const char *)"\\glq{}";
897         case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK:
898           return (const char *)UNDEFINED;
899         case LEFT_DOUBLE_QUOTATION_MARK:
900           return (const char *)"``";
901         case RIGHT_DOUBLE_QUOTATION_MARK:
902           return (const char *)"''";
903         case DOUBLE_LOW_9_QUOTATION_MARK:
904           return (const char *)"\\glqq{}";
905         case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK:
906           return (const char *)UNDEFINED;
907         case DAGGER:
908           return (const char *)"\\dag";
909         case DOUBLE_DAGGER:
910           return (const char *)"\\ddag";
911         case BULLET:
912           return (const char *)"$\\bullet$";
913         case TRIANGULAR_BULLET:
914           return (const char *)"$\\blacktriangleright";
915         case HYPHENATION_POINT:
916           return (const char *)"\\-";
917         case HORIZONTAL_ELLIPSIS:
918           return (const char *)"\\ldots";
919         case PER_MILLE_SIGN:
920           return (const char *)UNDEFINED;
921         case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK:
922           return (const char *)"\\flq{}";
923         case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK:
924           return (const char *)"\\frq{}";
925         /* ligatures */
926         case LATIN_SMALL_LIGATURE_FF:
927           return (const char *)"ff";
928         case LATIN_SMALL_LIGATURE_FI:
929           return (const char *)"fi";
930         case LATIN_SMALL_LIGATURE_FL:
931           return (const char *)"fl";
932         case LATIN_SMALL_LIGATURE_FFI:
933           return (const char *)"ffi";
934         case LATIN_SMALL_LIGATURE_FFL:
935           return (const char *)"ffl";
936         case LATIN_SMALL_LIGATURE_LONG_S_T:
937         case LATIN_SMALL_LIGATURE_ST:
938           return (const char *)"st";
939         /* reserved */
940         case 0:
941           return (const char *)"";
942         case UNKNOWN:
943           return (const char *)"\\_";
944         case PICTURE:
945           return (const char *)"(PICTURE)";
946         default:
947           /* snprintf seems to be no standard, so I use insecure sprintf */
948           sprintf(buf,"\\symbol{%u}",(unsigned)c);
949           return buf;  /* UNDEFINED; */
950         }
951     case HTML:
952       if ( c >= SPACE && c <= TILDE ) { /* ASCII */
953         switch (c) {
954           case '&':
955             return (const char *)"&amp;";
956           /* semicolon must not be coded */
957           case '\'':
958             return (const char *)"&apos;";
959           case '"':
960             return (const char *)"&quot;";
961           case '<':
962             return (const char *)"&lt;";
963           case '>':
964             return (const char *)"&gt;";
965         }
966         buf[0] = (char)c;
967         return buf;
968       }
969       switch (c) {
970         case PICTURE:
971           return (const char *)"<!--PICTURE-->";
972         case UNKNOWN:
973           return (const char *)"_"; /* better use colored symbol? */
974         case LINE_FEED:
975           return (const char *)"<br />";  /* \n handled somwhere else? */
976         case FORM_FEED:
977         case CARRIAGE_RETURN:
978           return (const char *)"<br />";
979         case NO_BREAK_SPACE:
980           return (const char *)"<nobr />";
981         case INVERTED_EXCLAMATION_MARK:
982           return (const char *)"&iexcl;";
983         case CENT_SIGN:
984           return (const char *)"&cent;";
985         case POUND_SIGN:
986           return (const char *)"&pound;";
987         case CURRENCY_SIGN:
988           return (const char *)"&curren;";
989         case YEN_SIGN:
990           return (const char *)"&yen;";
991         case BROKEN_BAR:
992           return (const char *)"&brvbar;";
993         case SECTION_SIGN:
994           return (const char *)"&sect;";
995         case DIAERESIS:
996           return (const char *)"&uml;";
997         case COPYRIGHT_SIGN:
998           return (const char *)"&copy;";
999         case FEMININE_ORDINAL_INDICATOR:
1000           return (const char *)"&ordfem;";
1001         case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
1002           return (const char *)"&laquo;";
1003         case NOT_SIGN:
1004           return (const char *)"&not;";
1005         case SOFT_HYPHEN:
1006           return (const char *)"&shy;";
1007         case REGISTERED_SIGN:
1008           return (const char *)"&reg;";
1009         case MACRON:
1010           return (const char *)"&macr;";
1011         case DEGREE_SIGN:
1012           return (const char *)"&deg;";
1013         case PLUS_MINUS_SIGN:
1014           return (const char *)"&plusmn;";
1015         case SUPERSCRIPT_TWO:
1016           return (const char *)"&sup2;";
1017         case SUPERSCRIPT_THREE:
1018           return (const char *)"&sup3;";
1019         case ACUTE_ACCENT:
1020           return (const char *)"&acute;";
1021         case MICRO_SIGN:
1022           return (const char *)"&micro;";
1023         case PILCROW_SIGN:
1024           return (const char *)"&para;";
1025         case MIDDLE_DOT:
1026           return (const char *)"&middot;";
1027         case CEDILLA:
1028           return (const char *)"&cedil;";
1029         case SUPERSCRIPT_ONE:
1030           return (const char *)"&sup1;";
1031         case MASCULINE_ORDINAL_INDICATOR:
1032           return (const char *)"&ordm;";
1033         case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
1034           return (const char *)"&raquo;";
1035         case VULGAR_FRACTION_ONE_QUARTER:
1036           return (const char *)"&frac14;";
1037         case VULGAR_FRACTION_ONE_HALF:
1038           return (const char *)"&frac12;";
1039         case VULGAR_FRACTION_THREE_QUARTERS:
1040           return (const char *)"&frac34;";
1041         case INVERTED_QUESTION_MARK:
1042           return (const char *)"&iquest;";        
1043         case LATIN_CAPITAL_LETTER_A_WITH_GRAVE:
1044           return (const char *)"&Agrave;";
1045         case LATIN_CAPITAL_LETTER_A_WITH_ACUTE:
1046           return (const char *)"&Aacute;";
1047         case LATIN_CAPITAL_LETTER_A_WITH_BREVE:
1048           return (const char *)"&Abreve;";
1049         case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX:
1050           return (const char *)"&Acirc;";
1051         case LATIN_CAPITAL_LETTER_A_WITH_TILDE:
1052           return (const char *)"&Atilde;";
1053         case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS:
1054           return (const char *)"&Auml;";
1055         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
1056           return (const char *)"&Aring;";
1057         case LATIN_CAPITAL_LETTER_AE:
1058           return (const char *)"&AElig;";
1059         case LATIN_CAPITAL_LETTER_C_WITH_CARON:
1060           return (const char *)"&Ccaron;";
1061         case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA:
1062           return (const char *)"&Ccedil;";
1063         case LATIN_CAPITAL_LETTER_E_WITH_GRAVE:
1064           return (const char *)"&Egrave;";
1065         case LATIN_CAPITAL_LETTER_E_WITH_ACUTE:
1066           return (const char *)"&Eacute;";
1067         case LATIN_CAPITAL_LETTER_E_WITH_CARON:
1068           return (const char *)"&Ecaron;";
1069         case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX:
1070           return (const char *)"&Ecirc;";
1071         case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS:
1072           return (const char *)"&Euml;";
1073         case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
1074           return (const char *)"&Igrave;";
1075         case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
1076           return (const char *)"&Iacute;";
1077         case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX:
1078           return (const char *)"&Icirc;";
1079         case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS:
1080           return (const char *)"&Iuml;";
1081         case LATIN_CAPITAL_LETTER_ETH:
1082           return (const char *)"&ETH;";
1083         case LATIN_CAPITAL_LETTER_N_WITH_TILDE:
1084           return (const char *)"&Ntilde;";
1085         case LATIN_CAPITAL_LETTER_O_WITH_GRAVE:
1086           return (const char *)"&Ograve;";
1087         case LATIN_CAPITAL_LETTER_O_WITH_ACUTE:
1088           return (const char *)"&Oacute;";
1089         case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX:
1090           return (const char *)"&Ocirc;";
1091         case LATIN_CAPITAL_LETTER_O_WITH_TILDE:
1092           return (const char *)"&Otilde;";
1093         case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS:
1094           return (const char *)"&Ouml;";
1095         case MULTIPLICATION_SIGN:
1096           return (const char *)"&times";
1097         case LATIN_CAPITAL_LETTER_O_WITH_STROKE:
1098           return (const char *)"&Oslash;";
1099         case LATIN_CAPITAL_LETTER_S_WITH_CARON:
1100           return (const char *)"&Scaron;";
1101         case LATIN_CAPITAL_LETTER_U_WITH_GRAVE:
1102           return (const char *)"&Ugrave;";
1103         case LATIN_CAPITAL_LETTER_U_WITH_ACUTE:
1104           return (const char *)"&Uacute;";
1105         case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX:
1106           return (const char *)"&Ucirc;";
1107         case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS:
1108           return (const char *)"&Uuml;";
1109         case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE:
1110           return (const char *)"&Yacute;";
1111         case LATIN_CAPITAL_LETTER_Z_WITH_CARON:
1112           return (const char *)"&Zcaron;";
1113         case LATIN_CAPITAL_LETTER_THORN:
1114           return (const char *)"&THORN;";
1115         case LATIN_SMALL_LETTER_SHARP_S:
1116           return (const char *)"&szlig;";
1117         case LATIN_SMALL_LETTER_A_WITH_GRAVE:
1118           return (const char *)"&agrave;";
1119         case LATIN_SMALL_LETTER_A_WITH_ACUTE:
1120           return (const char *)"&aacute;";
1121         case LATIN_SMALL_LETTER_A_WITH_BREVE:
1122           return (const char *)"&abreve;";
1123         case LATIN_SMALL_LETTER_A_WITH_CARON:
1124           return (const char *)"&acaron;";
1125         case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX:
1126           return (const char *)"&acirc;";
1127         case LATIN_SMALL_LETTER_A_WITH_TILDE:
1128           return (const char *)"&atilde;";
1129         case LATIN_SMALL_LETTER_A_WITH_DIAERESIS:
1130           return (const char *)"&auml;";
1131         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
1132           return (const char *)"&aring;";
1133         case LATIN_SMALL_LETTER_AE:
1134           return (const char *)"&aelig;";
1135         case LATIN_SMALL_LETTER_C_WITH_CARON:
1136           return (const char *)"&ccaron;";
1137         case LATIN_SMALL_LETTER_C_WITH_CEDILLA:
1138           return (const char *)"&ccedil;";
1139         case LATIN_SMALL_LETTER_E_WITH_GRAVE:
1140           return (const char *)"&egrave;";
1141         case LATIN_SMALL_LETTER_E_WITH_ACUTE:
1142           return (const char *)"&eacute;";
1143         case LATIN_SMALL_LETTER_E_WITH_CARON:
1144           return (const char *)"&ecaron;";
1145         case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX:
1146           return (const char *)"&ecirc;";
1147         case LATIN_SMALL_LETTER_E_WITH_DIAERESIS:
1148           return (const char *)"&euml;";
1149         case LATIN_SMALL_LETTER_I_WITH_GRAVE:
1150           return (const char *)"&igrave;";
1151         case LATIN_SMALL_LETTER_I_WITH_ACUTE:
1152           return (const char *)"&iacute;";
1153         case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX:
1154           return (const char *)"&icirc;";
1155         case LATIN_SMALL_LETTER_I_WITH_DIAERESIS:
1156           return (const char *)"&iuml;";
1157         case LATIN_SMALL_LETTER_ETH:
1158           return (const char *)"&eth;";
1159         case LATIN_SMALL_LETTER_N_WITH_TILDE:
1160           return (const char *)"&ntilde;";
1161         case LATIN_SMALL_LETTER_O_WITH_GRAVE:
1162           return (const char *)"&ograve;";
1163         case LATIN_SMALL_LETTER_O_WITH_ACUTE:
1164           return (const char *)"&oacute;";
1165         case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX:
1166           return (const char *)"&ocirc;";
1167         case LATIN_SMALL_LETTER_O_WITH_TILDE:
1168           return (const char *)"&otilde;";
1169         case LATIN_SMALL_LETTER_O_WITH_DIAERESIS:
1170           return (const char *)"&ouml;";
1171         case DIVISION_SIGN:
1172           return (const char *)"&divide;";
1173         case LATIN_SMALL_LETTER_O_WITH_STROKE:
1174           return (const char *)"&oslash;";
1175         case LATIN_SMALL_LETTER_S_WITH_CARON:
1176           return (const char *)"&scaron;";
1177         case LATIN_SMALL_LETTER_U_WITH_GRAVE:
1178           return (const char *)"&ugrave;";
1179         case LATIN_SMALL_LETTER_U_WITH_ACUTE:
1180           return (const char *)"&uacute;";
1181         case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX:
1182           return (const char *)"&ucirc;";
1183         case LATIN_SMALL_LETTER_U_WITH_DIAERESIS:
1184           return (const char *)"&uuml;";
1185         case LATIN_SMALL_LETTER_Y_WITH_ACUTE:
1186           return (const char *)"&yacute;";
1187         case LATIN_SMALL_LETTER_THORN:
1188           return (const char *)"&thorn;";
1189         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1190           return (const char *)"&yuml;";
1191         case LATIN_SMALL_LETTER_Z_WITH_CARON:
1192           return (const char *)"&zcaron;";
1193         case EURO_CURRENCY_SIGN:
1194           return (const char *)"&euro;";
1195         case 0:
1196           return (const char *)"";
1197         default:
1198           sprintf(buf,"&#%u;",(unsigned)c);
1199           return buf;  /* undefined */
1200       }
1201       /* break; unreachable code */
1202     case XML:   /* only 5 &xxx;-ENTITIES ar defined by default */
1203       if ( c >= SPACE && c <= TILDE ) { /* ASCII */
1204         switch (c) {
1205           case '&':
1206             return (const char *)"&amp;";
1207           case '\'':
1208             return (const char *)"&apos;";
1209           case '"':
1210             return (const char *)"&quot;";
1211           case '<':
1212             return (const char *)"&lt;";
1213           case '>':
1214             return (const char *)"&gt;";
1215         }
1216         buf[0] = (char)c;
1217         return buf;
1218       }
1219       switch (c) {    /* subject of change! */
1220         case PICTURE:
1221           return (const char *)"(PICTURE)";
1222         case UNKNOWN:
1223           return (const char *)"_"; /* better use colored symbol? */
1224         case LINE_FEED:             /* \n handled somwhere else? */
1225         case FORM_FEED:
1226         case CARRIAGE_RETURN:
1227           return (const char *)"<br />";
1228         case NO_BREAK_SPACE:
1229           return (const char *)"<nobr />";
1230         case 0:
1231           return (const char *)"";
1232         default:
1233           sprintf(buf,"&#x%03x;",(unsigned)c);
1234           return buf;  /* undefined */
1235       }
1236       /* break; unreachable code */
1237     case SGML:
1238       switch (c) {
1239         default:
1240           sprintf(buf,"&#%u;",(unsigned)c);
1241           return buf;  /* UNDEFINED */
1242       }
1243       /* break; unreachable code */
1244     case ASCII: /* mainly used for debugging */
1245       if ( c=='\n' || (c>= 0x20 && c <= 0x7F) ) {
1246         buf[0] = (char)c;
1247         return buf;
1248       }
1249       switch (c) {
1250         /* extra */
1251         case UNKNOWN:
1252           return (const char *)"(?)";
1253         case PICTURE:
1254           return (const char *)"(?)";
1255                 
1256         default:
1257           /* snprintf seems to be no standard, so I use insecure sprintf */
1258           if ((unsigned)c>255) sprintf(buf,"(0x%04x)",(unsigned)c);
1259           else                 sprintf(buf,"(0x%02x)",(unsigned)c);
1260           return buf;  /* UNDEFINED; */
1261       }
1262       /* break; unreachable code */
1263     default: /* use UTF8 as default, test with xterm -u8 */
1264       /* extra */
1265       if ( c == UNKNOWN )  return (const char *)"_";
1266       if ( c == PICTURE )  return (const char *)"_"; /* Due to Mobile OCR */
1267       if ( c <= (wchar_t)0x0000007F ) {  /* UTF8 == 7bit ASCII */
1268         buf[0] = (char)c;
1269         return buf;
1270       }
1271       if ( c <= (wchar_t)0x000007FF ) {  /* UTF8 == 11bit */
1272         buf[0] = (char)(0xc0|((c>> 6) & 0x1f)); /* 110xxxxx */
1273         buf[1] = (char)(0x80|( c      & 0x3f)); /* 10xxxxxx */
1274         buf[2] = (char)0; /* terminate string */
1275         return buf;
1276       }
1277       /* wchar_t is 16bit for Borland-C !? Jan07 */
1278       if ( c <= (wchar_t)0x0000FFFF ) {  /* UTF8 == 16bit */
1279         buf[0] = (char)(0xe0|((c>>12) & 0x0f)); /* 1110xxxx */
1280         buf[1] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1281         buf[2] = (char)(0x80|( c      & 0x3f)); /* 10xxxxxx */
1282         buf[3] = (char)0; /* terminate string */
1283         return buf;
1284       }
1285       if ( c <= (wchar_t)0x001FFFFF ) {  /* UTF8 == 21bit */
1286         buf[0] = (char)(0xf0|((c>>18) & 0x07)); /* 11110xxx */
1287         buf[1] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1288         buf[2] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1289         buf[3] = (char)(0x80|( c      & 0x3f)); /* 10xxxxxx */
1290         buf[4] = (char)0; /* terminate string */
1291         return buf;
1292       }
1293       if ( c <= (wchar_t)0x03FFFFFF ) {  /* UTF8 == 26bit */
1294         buf[0] = (char)(0xf8|((c>>24) & 0x03)); /* 111110xx */
1295         buf[1] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */
1296         buf[2] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1297         buf[3] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1298         buf[4] = (char)(0x80|( c      & 0x3f)); /* 10xxxxxx */
1299         buf[5] = (char)0; /* terminate string */
1300         return buf;
1301       }
1302       if ( c <= (wchar_t)0x7FFFFFFF ) {  /* UTF8 == 31bit */
1303         buf[0] = (char)(0xfc|((c>>30) & 0x01)); /* 1111110x */
1304         buf[1] = (char)(0x80|((c>>24) & 0x3f)); /* 10xxxxxx */
1305         buf[2] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */
1306         buf[3] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1307         buf[4] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1308         buf[5] = (char)(0x80|( c      & 0x3f)); /* 10xxxxxx */
1309         buf[6] = (char)0; /* terminate string */
1310         return buf;
1311       }
1312       return (const char *)UNDEFINED;
1313   }
1314 }